├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── data_request.md
│   │   ├── feature_request.md
│   │   └── report-data-error.md
│   └── workflows
│       ├── pages.yml
│       └── pypi.yaml
├── .gitignore
├── LICENSE
├── NOTICE.txt
├── README.md
├── docs
│   ├── .gitignore
│   ├── 404.html
│   ├── Gemfile
│   ├── _config.yml
│   ├── _includes
│   │   └── head_custom.html
│   ├── docs
│   │   ├── Errata.md
│   │   ├── GRS检验的证明.md
│   │   ├── compute
│   │   │   └── compute.md
│   │   ├── configuration.md
│   │   ├── data
│   │   │   ├── A-share.md
│   │   │   └── data.md
│   │   ├── evaluation
│   │   │   ├── AcaEvaluatorModel.md
│   │   │   ├── AcaEvaluatorModelComparison.md
│   │   │   ├── aca_evaluator.md
│   │   │   └── evaluation.md
│   │   └── release-plan.md
│   └── index.md
├── file.py
├── firefin
│   ├── __init__.py
│   ├── cli
│   │   └── command.py
│   ├── common
│   │   ├── config.py
│   │   ├── config.yaml
│   │   └── const.py
│   ├── compute
│   │   ├── __init__.py
│   │   └── window.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── algorithm
│   │   │   ├── __init__.py
│   │   │   ├── _numba_funcs.py
│   │   │   ├── newey_west_ttest_1samp.py
│   │   │   └── regression.py
│   │   └── plot
│   │       ├── __init__.py
│   │       └── plots.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── datainfo.py
│   │   ├── fake.py
│   │   ├── file_reader.py
│   │   └── gateway.py
│   └── evaluation
│       ├── __init__.py
│       ├── academia
│       │   ├── AcaEvaluatorModel.py
│       │   ├── AcaEvaluatorModelComparison.py
│       │   ├── MSR_Test.py
│       │   ├── __init__.py
│       │   ├── anomaly_test.py
│       │   ├── fama_macbeth.py
│       │   ├── portfolio_sort.py
│       │   └── winsorizer.py
│       ├── eva_utils.py
│       └── industry
│           ├── __init__.py
│           └── evaluator.py
├── pyproject.toml
└── tests
    ├── evaluation
    │   ├── Beta_test.py
    │   ├── MSR_Test.py
    │   ├── aca_eva1_test.ipynb
    │   ├── eva_utils.py
    │   ├── fama_macbeth.py
    │   ├── grs.py
    │   └── portfolio_test.py
    ├── test.py
    ├── test_algo
    │   └── test_regression.py
    └── test_data.py

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: qiaobaochen

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/data_request.md:
--------------------------------------------------------------------------------
---
name: Data request
about: Request new data
title: NEW DATA REQUEST
labels: new data
assignees: qiaobaochen

---

**Which data do you want? Describe it in detail**
A clear and concise description of what the data is.

**Put the data source here**
URL here:

**Describe how to clean the data**
1. download
2. remove duplicates
3. fillna

**Additional context**
Add any other context or screenshots about the data request here.
23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/report-data-error.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Report data error 3 | about: Report Data error 4 | title: '' 5 | labels: data error 6 | assignees: qiaobaochen 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages 7 | name: Deploy Jekyll site to Pages 8 | 9 | on: 10 | push: 11 | branches: ["main"] 12 | paths: 13 | - "docs/**" 14 | 15 | # Allows you to run this workflow manually from the Actions tab 16 | workflow_dispatch: 17 | 18 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 19 | permissions: 20 | contents: read 21 | pages: write 22 | id-token: write 23 | 24 | # Allow one concurrent deployment 25 | concurrency: 26 | group: "pages" 27 | cancel-in-progress: true 28 | 29 | jobs: 30 | # Build job 31 | build: 32 | runs-on: ubuntu-latest 33 | defaults: 34 | run: 35 | working-directory: docs 36 | steps: 37 | - name: Checkout 38 | uses: actions/checkout@v4 39 | - name: Setup Ruby 40 | uses: ruby/setup-ruby@v1 41 | with: 42 | ruby-version: '3.3' # Not needed with a .ruby-version file 43 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 44 | cache-version: 0 # Increment this number if you need to re-download cached gems 45 | working-directory: '${{ github.workspace }}/docs' # Set the working-directory param to the docs folder 46 | - name: Setup Pages 47 | id: pages 48 | uses: actions/configure-pages@v5 49 | - name: Build with Jekyll 50 | # Outputs to the './_site' directory by default 51 | run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" 52 | env: 53 | JEKYLL_ENV: production 54 | - name: Upload artifact 55 | # Automatically uploads an artifact from the './_site' directory by default 56 | uses: actions/upload-pages-artifact@v3 57 | with: 58 | path: docs/_site # Set the path to the docs folder 59 | 60 | # Deployment job 61 | deploy: 62 | environment: 63 | name: github-pages 64 | url: ${{ steps.deployment.outputs.page_url }} 65 | runs-on: ubuntu-latest 66 | needs: build 67 | steps: 68 | - name: Deploy to 
GitHub Pages 69 | id: deployment 70 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | name: Build distribution 📦 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | persist-credentials: false 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.x" 18 | - name: Install pypa/build 19 | run: python3 -m pip install build --user 20 | - name: Build a binary wheel and a source tarball 21 | run: python3 -m build 22 | - name: Store the distribution packages 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: python-package-distributions 26 | path: dist/ 27 | 28 | publish-to-pypi: 29 | name: >- 30 | Publish Python 🐍 distribution 📦 to PyPI 31 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 32 | needs: 33 | - build 34 | runs-on: ubuntu-latest 35 | environment: 36 | name: pypi 37 | url: https://pypi.org/p/firefin # Replace with your PyPI project name 38 | permissions: 39 | id-token: write # IMPORTANT: mandatory for trusted publishing 40 | 41 | steps: 42 | - name: Download all the dists 43 | uses: actions/download-artifact@v4 44 | with: 45 | name: python-package-distributions 46 | path: dist/ 47 | - name: Publish distribution 📦 to PyPI 48 | uses: pypa/gh-action-pypi-publish@release/v1 49 | 50 | github-release: 51 | name: >- 52 | Sign the Python 🐍 distribution 📦 with Sigstore 53 | and upload them to GitHub Release 54 | needs: 55 | - publish-to-pypi 56 | runs-on: ubuntu-latest 57 | 58 | permissions: 59 | contents: write # IMPORTANT: mandatory for making GitHub Releases 60 | id-token: write # IMPORTANT: mandatory for sigstore 61 | 62 | steps: 63 | - name: Download all the dists 64 | uses: actions/download-artifact@v4 65 | with: 66 | name: python-package-distributions 67 | path: dist/ 68 | - name: Sign the dists with Sigstore 69 | uses: sigstore/gh-action-sigstore-python@v3.0.0 70 | with: 71 | inputs: >- 72 | ./dist/*.tar.gz 73 | ./dist/*.whl 74 | - name: Create GitHub Release 75 | env: 76 | GITHUB_TOKEN: ${{ github.token }} 77 | run: >- 78 | gh release create 79 | "$GITHUB_REF_NAME" 80 | --repo "$GITHUB_REPOSITORY" 81 | --notes "" 82 | - name: Upload artifact signatures to GitHub Release 83 | env: 84 | GITHUB_TOKEN: ${{ github.token }} 85 | # Upload to GitHub Release using the `gh` CLI. 86 | # `dist/` contains the built packages, and the 87 | # sigstore-produced signatures and certificates. 
88 | run: >- 89 | gh release upload 90 | "$GITHUB_REF_NAME" dist/** 91 | --repo "$GITHUB_REPOSITORY" 92 | 93 | 94 | 95 | # publish-to-testpypi: 96 | publish-to-testpypi: 97 | name: Publish Python 🐍 distribution 📦 to TestPyPI 98 | needs: 99 | - build 100 | runs-on: ubuntu-latest 101 | 102 | environment: 103 | name: testpypi 104 | url: https://test.pypi.org/p/firefin 105 | 106 | permissions: 107 | id-token: write # IMPORTANT: mandatory for trusted publishing 108 | 109 | steps: 110 | - name: Download all the dists 111 | uses: actions/download-artifact@v4 112 | with: 113 | name: python-package-distributions 114 | path: dist/ 115 | - name: Publish distribution 📦 to TestPyPI 116 | uses: pypa/gh-action-pypi-publish@release/v1 117 | with: 118 | repository-url: https://test.pypi.org/legacy/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | develop.ipynb 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | 163 | # feather datafiles 164 | *.feather 165 | # gzip datafiles 166 | *.gz 167 | 168 | # vscode 169 | .vscode/ 170 | 171 | # data packages 172 | AStockData.tar.gz -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Copyright Super Quantum Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# F.I.R.E. Factor Investment Research Engine

This repo is the bundled open-source toolkit for the book _Navigating the Factor Zoo: The Science of Quantitative Investing_.

## Installation

```bash
# for the stable version
pip install firefin

# for test and nightly versions
pip install -i https://test.pypi.org/simple/ firefin

# install from source for local testing
## replace $ThisRepoURL with the actual repo url
git clone $ThisRepoURL
## install from source
pip install -e .
```

## Usage

Download the data
from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz).

Run the command below to download the data and place it in the correct path automatically.

```bash
# We have not released this repo yet, so you need to download the data manually! See the `firefin load` command below.
# Auto download data
firefin download
```

If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to check the data and put it in the correct path:

```bash
# replace path_to_data.tar.gz with the actual path
firefin load path_to_data.tar.gz
```

## Start to code

```python
import firefin

# get data
data = firefin.fetch_data(["open", "close", "volume"])
open_price = data["open"]


def pv_corr(close, volume):
    # price-volume correlation
    return close.rolling(20).corr(volume)


factor = pv_corr(data["close"], data["volume"])

# compute forward returns
fr = firefin.compute_forward_returns(open_price.shift(-1), [1, 5, 10])

# evaluate factor
mng = firefin.Evaluator(factor, fr)
mng.get_ic("pearson")
mng.get_quantile_returns(5)
```

## Features

1. Handy functions for fast factor computation
2. Various tools for factor evaluation

--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
_site
.sass-cache
.jekyll-cache
.jekyll-metadata
vendor
# gem
Gemfile.lock

--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
---
permalink: /404.html
layout: page
---
404

Page not found :(

The requested page could not be found.
26 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org/" 2 | # Hello! This is where you manage which Jekyll version is used to run. 3 | # When you want to use a different version, change it below, save the 4 | # file and run `bundle install`. Run Jekyll with `bundle exec`, like so: 5 | # 6 | # bundle exec jekyll serve 7 | # 8 | # This will help ensure the proper Jekyll version is running. 9 | # Happy Jekylling! 10 | # gem "jekyll", "~> 4.4.1" 11 | gem "github-pages", "~> 232", group: :jekyll_plugins 12 | # just the docs theme 13 | gem "just-the-docs" 14 | # If you want to use GitHub Pages, remove the "gem "jekyll"" above and 15 | # uncomment the line below. To upgrade, run `bundle update github-pages`. 16 | # gem "github-pages", group: :jekyll_plugins 17 | # If you have any plugins, put them here! 18 | group :jekyll_plugins do 19 | gem "jekyll-feed", "~> 0.12" 20 | end 21 | 22 | # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem 23 | # and associated library. 24 | platforms :mingw, :x64_mingw, :mswin, :jruby do 25 | gem "tzinfo", ">= 1", "< 3" 26 | gem "tzinfo-data" 27 | end 28 | 29 | # Performance-booster for watching directories on Windows 30 | gem "wdm", "~> 0.1", :platforms => [:mingw, :x64_mingw, :mswin] 31 | 32 | # Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem 33 | # do not have a Java counterpart. 34 | gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby] 35 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: F.I.R.E. 2 | description: Factor Investment Research Engine (The bundled opensource toolkit for book Navigating the Factor Zoo:The Science of Quantitative Investing.) 3 | theme: just-the-docs 4 | 5 | url: fire-institute.github.io 6 | 7 | aux_links: 8 | F.I.R.E. on GitHub: https://github.com/fire-institute/fire 9 | 10 | color_scheme: dark 11 | 12 | # Build settings 13 | markdown: kramdown 14 | compress_html: 15 | blanklines: true -------------------------------------------------------------------------------- /docs/_includes/head_custom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 20 | 21 | -------------------------------------------------------------------------------- /docs/docs/Errata.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Errata 3 | permalink: /errata/ 4 | nav_order: 2 5 | --- 6 | # Errata for *Navigating the Factor Zoo: The Science of Quantitative Investing* 7 | --- 8 | ## Overview 9 | 10 | This document captures all **verified printing and content errors** identified in *_Navigating the Factor Zoo: The Science of Quantitative Investing_*. It is maintained in the Fire Institute GitHub repository (https://github.com/fire-institute/fire) under `docs/docs/errata.md`. 11 | 12 | ### Structure of Entries 13 | Each erratum follows this format: 14 | 15 | | Field | Description | 16 | | ------------------ | ------------------------------------------------------------------ | 17 | | **Anchor** | Unique Markdown heading used as the link target. | 18 | | **Original** | Verbatim the incorrect text, caption, or equation. 
| **Correction** | The accurate replacement text, caption, or equation. |
| **Note** | (Optional) Additional context or explanation. |

---

### Submitting a New Error Report
To contribute:

1. **Search** existing GitHub issues to avoid duplicates.
2. **Open a new issue** with the title:
   ```
   [Errata] Page – brief description
   ```
3. **Fill in the template** in the issue body:
   ```markdown
   **Page**:
   **Section or Heading**:
   **Original**:
   **Correction**:
   ```
4. A maintainer will review, label it **confirmed**, and then add it here.

---

## Table of Contents

* [First Edition — Routledge (Hardcover & Paperback)](#first-edition-routledge-hardcover--paperback)
* [Page 66 – Equation 3.19](#page-66-equation-3-19)

---

## First Edition — Routledge (Hardcover & Paperback)

- **Publisher**: Routledge
- **Publication Date**: November 20, 2024 (Hardcover) / December 9, 2024 (Paperback)
- **Formats**: Hardcover (296 pp.) / Paperback (310 pp.)
- **ISBN-10**: 1032768436 (HC) / 103276841X (PB)
- **ISBN-13**: 978-1032768434 (HC) / 978-1032768410 (PB)

### Page 66 – Equation 3.19
**Original**

> In the limit of $n \rightarrow \infty$,$R V_{t}^{+} \rightarrow \text{ }_{t-1}^{t} \sigma_{s}^{2} ds+\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}>0 }^{2} $, $ R V_{t}^{-} \rightarrow \int_{t-1}^{t} \sum_{s}^{2} d s+\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}0 }^{2} $, and,
>
> $$S J_{t}=\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}>0 }^{2} -\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau} 0}^{2} $$

**Correction**

> In the limit of $n\to \infty$, $RV_t^+ \to \int _{t- 1}^t\sigma _s^2ds+ \sum_{t- 1\leq \tau \leq t}J_\tau^2 \mathbb{I} _{J_\tau > 0}$, $RV_t^- \to \int_{t- 1}^t \sigma_s^2 ds + \sum_{t-1\leq\tau\leq t}J_\tau^2\mathbb{I}_{J_\tau<0} $, and,
>
> $$SJ_t = \sum_{t- 1\leq \tau \leq t}J_\tau^2 \mathbb{I} _{J_\tau > 0}-\sum_{t-1\leq\tau\leq t}J_\tau^2\mathbb{I}_{J_\tau<0}.$$

**Note**

> Inserted the missing integral symbol, properly representing the continuous term as $\int_{t-1}^t\sigma_s^2\,ds$. Replaced the ambiguous jump-index notation with indicator functions $\mathbb{I}_{J_\tau>0}$ and $\mathbb{I}_{J_\tau<0}$ to clearly separate positive and negative jumps.
--------------------------------------------------------------------------------
/docs/docs/GRS检验的证明.md:
--------------------------------------------------------------------------------
# Proof of the GRS Test

Basics of the linear regression: quantities with a hat are estimators, quantities without a hat are the assumed latent true values; $y_{i}$, $x_{i}$ are observations.

$$
\vec{y}_{i}=\vec{\alpha}+\vec{\beta}x_{i}+\vec{\varepsilon},\qquad \vec{y}_{N\times 1},\quad \vec{\alpha}_{N\times 1},\quad \vec{\beta}_{N\times K},\quad \vec{\varepsilon}_{N\times 1}
$$

$$
\begin{array}{l}
\vec{y}=\vec{\alpha}+\vec{\beta}x_{i}\\
\vec{y}_{i}=\vec{\alpha}+\vec{\beta}x_{i}+\vec{\varepsilon}_{i}\\
\vec{y}_{i}=\hat{\vec{\alpha}}+\hat{\vec{\beta}}x_{i}+\hat{\vec{\varepsilon}}_{i}
\end{array}
$$

Least squares (all derivations below assume $\vec{\varepsilon}$ is normally distributed):

$$
\begin{array}{l}
\underset{\vec{\alpha}}{\arg\min}\displaystyle\sum_{i=1}^{T}\left\|\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right\|^{2}\\[4pt]
\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)
=\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)\\
=\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}^{\prime}\vec{y}_{i}-\vec{y}_{i}^{\prime}\vec{\alpha}-\vec{y}_{i}^{\prime}\vec{\beta}x_{i}-\vec{\alpha}^{\prime}\vec{y}_{i}+\vec{\alpha}^{\prime}\vec{\alpha}+\vec{\alpha}^{\prime}\vec{\beta}x_{i}-x_{i}^{\prime}\vec{\beta}^{\prime}\vec{y}_{i}+x_{i}^{\prime}\vec{\beta}^{\prime}\vec{\alpha}+\left(\vec{\beta}x_{i}\right)^{\prime}\left(\vec{\beta}x_{i}\right)\right)
\end{array}
$$

Setting the derivative with respect to $\vec{\alpha}$ equal to zero:

$$
\Rightarrow\sum_{i=1}^{T}-2\vec{y}_{i}+2\vec{\alpha}+2\vec{\beta}x_{i}=0
$$

Setting the derivative with respect to $\vec{\beta}$ equal to zero:

$$
\frac{\partial}{\partial\vec{\beta}}\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)=\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+2\vec{\alpha}x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
$$

Solving for $\vec{\alpha}$ and $\vec{\beta}$:

$$
\left\{\begin{array}{l}
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}+2\vec{\alpha}+2\vec{\beta}x_{i}=0\\
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+2\vec{\alpha}x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
\end{array}\right.
\Rightarrow
\left\{\begin{array}{l}
-2T\bar{y}+2T\vec{\alpha}+2T\vec{\beta}\bar{x}=0\ \Rightarrow\ \hat{\vec{\alpha}}=\bar{y}-\hat{\vec{\beta}}\bar{x}\\
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+\left(2\bar{y}-2\vec{\beta}\bar{x}\right)x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
\end{array}\right.
$$

Variance of $\hat{\vec{\beta}}$:

$$
\hat{\vec{\beta}}=\sum_{i=1}^{T}\left(\vec{y}_{i}-\bar{y}\right)x_{i}^{\prime}\cdot\left(\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right)^{-1}
$$

$$
\begin{array}{l}
\hat{\vec{\beta}}=\vec{\beta}+\displaystyle\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}
=\vec{\beta}+\displaystyle\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)\left(x_{i}-\bar{x}\right)^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}
\end{array}
$$

Here $x_{i}$ and $\bar{x}$ are constants (this holds in the univariate and the multivariate case alike). The assumption used is that the variance of $\varepsilon$ is unrelated to $x$, i.e. $\operatorname{Var}(\varepsilon\mid x)=\sigma^{2}$, $\Sigma=\sigma^{2}I$:

$$
\operatorname{Var}(\hat{\beta})=\sigma^{2}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}=\frac{1}{T}\Omega^{-1}\Sigma,\qquad \Omega=\frac{1}{T}\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}
$$

Variance of $\hat{\alpha}$:

$$
\operatorname{Var}(\hat{\alpha})=\frac{\Sigma}{T}+\bar{x}^{\prime}\operatorname{Var}(\hat{\beta})\bar{x}=\frac{1}{T}\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)\Sigma
$$

$$
\left[\ \hat{\vec{\alpha}}=\bar{y}-\hat{\vec{\beta}}\bar{x}=\vec{\alpha}+\vec{\beta}\bar{x}+\frac{1}{T}\sum_{i=1}^{T}\vec{\varepsilon}_{i}-\left(\vec{\beta}+\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}\right)\bar{x}=\vec{\alpha}+\frac{1}{T}\sum_{i=1}^{T}\vec{\varepsilon}_{i}-\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}\bar{x}\ \right]
$$

$$
\Rightarrow\ \hat{\vec{\alpha}}\sim N_{N}\!\left(\vec{\alpha},\ \frac{1}{T}\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)\Sigma\right)
$$

Under the null hypothesis $\vec{\alpha}=0$:

$$
(T-K-1)\hat{\Sigma}\sim W_{N}(T-K-1,\ \Sigma)
$$

(because the estimator of $\Sigma$ is $\hat{\Sigma}=\frac{1}{T-K-1}\sum_{i=1}^{T}(\hat{\varepsilon}_{i}-\bar{\hat{\varepsilon}})(\hat{\varepsilon}_{i}-\bar{\hat{\varepsilon}})^{\prime}$, and the least-squares assumption $E(\hat{\varepsilon})=0$ gives $\bar{\hat{\varepsilon}}=0$), and

$$
\sqrt{T/\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)}\cdot\hat{\vec{\alpha}}\sim N_{N}\left(0,\ \Sigma\right)
$$

Constructing the Hotelling $T^{2}$ statistic. The statistic is built as follows: if $x\sim N_{p}(0,\Sigma)$ and $W\sim W_{p}(n,\Sigma)$, then

$$
\frac{n-p+1}{pn}\,n\,x^{\prime}W^{-1}x\sim F(p,\ n-p+1).
$$

Substitute $\sqrt{T/\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)}\cdot\hat{\vec{\alpha}}\sim N_{N}(0,\Sigma)$ for $x$ and $(T-K-1)\hat{\Sigma}\sim W_{N}(T-K-1,\Sigma)$ for $W$, i.e. $p=N$ and $n=T-K-1$:

$$
\Rightarrow\ \frac{T(T-K-N)}{N(T-K-1)}\cdot\hat{\alpha}^{\prime}\hat{\Sigma}^{-1}\hat{\alpha}\left(\frac{1}{1+\bar{x}^{\prime}\Omega^{-1}\bar{x}}\right)\sim F_{N,\ T-K-N}
$$

--------------------------------------------------------------------------------
/docs/docs/compute/compute.md:
--------------------------------------------------------------------------------
---
title: Construct Factor
nav_order: 5
permalink: /construct/
---
How to use data in your project
{: .fs-6 .fw-300 }
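This page is still a stub. As a starting point, the sketch below mirrors the factor-construction example from the project README — `fetch_data` plus ordinary pandas rolling operations. Treat it as a minimal sketch rather than the final content of this page.

```python
import firefin

# Fetch aligned (Time x Stock) DataFrames; field names follow the data docs.
data = firefin.fetch_data(["close", "volume"])

def pv_corr(close, volume, window=20):
    # rolling price-volume correlation, computed per security
    return close.rolling(window).corr(volume)

factor = pv_corr(data["close"], data["volume"])
```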
--------------------------------------------------------------------------------
/docs/docs/configuration.md:
--------------------------------------------------------------------------------
---
title: Installation
nav_order: 3
---

# Installation

```bash
# for the stable version
pip install firefin

# or install from source for local testing
## replace $ThisRepoURL with the actual repo url
git clone $ThisRepoURL
## install from source
pip install -e .
```

# Load Data

Download the data
from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz).

Run the command below to download the data and place it in the correct path automatically.

```bash
# We have not released this repo yet, so you need to download the data manually! See the `firefin load` command below.
# Auto download data
firefin download
```

If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to check the data and put it in the correct path:

```bash
# replace path_to_data.tar.gz with the actual path
firefin load path_to_data.tar.gz
```
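Once installed and loaded, a quick sanity check confirms the data layout — a minimal sketch using the `fetch_data` call documented in the README (field names as listed on the data pages):

```python
import firefin

# Load one field and confirm the (Time x Stock) layout described in the data docs.
close = firefin.fetch_data(["close"])["close"]
print(close.shape)        # (trading days, securities)
print(close.index[:3])    # trading dates
print(close.columns[:3])  # security identifiers
```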
--------------------------------------------------------------------------------
/docs/docs/data/A-share.md:
--------------------------------------------------------------------------------
---
title: A-Share Data
permalink: /data/a-share/
parent: Data Management
nav_order: 4.1
---

# A-Share Data

Fire provides comprehensive data for the Chinese A-share market, including historical prices, financial
statements, and other relevant information. This section outlines the available datasets and how to access them.

## Available Datasets

### Historical Prices

- **Daily Prices**: Contains daily open, high, low, close, money, vwap and volume data.
- **Daily Valuations**: Provides daily valuation metrics such as P/E ratio, P/B ratio, etc.

Data range: 2015-01-01 to 2025-05-01

Dataset categories:

1. quote: price and volume data
   (28 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| open | Open price (daily) |
| close | Close price (daily) |
| high | High price (daily) |
| low | Low price (daily) |
| volume | Trading volume (shares/units, daily) |
| money | Trading value (CNY, daily) |
| return_adj | Return (daily) |
| vwap | Volume-weighted average price (daily) |
| adj_factor | Adjustment factor |
| open_dr | Open price (daily) |
| high_dr | High price (daily) |
| low_dr | Low price (daily) |
| close_dr | Close price (daily) |
| volume_dr | Trading volume (shares/units, daily) |
| vwap_dr | Volume-weighted average price (daily) |
| FinanceValue | Margin financing balance (CNY) |
| FinanceBuyValue | Margin financing buy value (CNY) |
| FinanceRefundValue | Margin financing repayment value (CNY) |
| SecurityVolume | Securities lending balance (shares) |
| SecuritySellVolume | Securities lending sell volume (shares) |
| SecurityRefundVolume | Securities lending repayment volume (shares) |
| SecurityValue | Securities lending balance (CNY) |
| TradingValue | Total margin trading balance (CNY) |
| FinaInTotalRatio | Margin financing as a share of exchange-wide financing balance (%) |
| SecuInTotalRatio | Securities lending as a share of exchange-wide lending balance (%) |
| shares_holding | Shares held (shares) |
| hold_ratio | Holding ratio (%) |
| adjusted_hold_ratio | Adjusted holding ratio (%) |

2. valuation: valuation data
   (14 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| circulating_market_cap | Circulating market cap (CNY 100M, incl. HK-listed shares) |
| pcf_ratio | Price-to-cash-flow ratio (PCF, net cash flow TTM) |
| market_cap | Total market cap (CNY 100M, incl. HK-listed shares) |
| pe_ratio_lyr | Static P/E ratio (last yearly report) |
| circulating_cap | Circulating share capital (10K shares, incl. HK-listed shares) |
| capitalization | Total share capital (10K shares, incl. HK-listed shares) |
| pb_ratio | Price-to-book ratio (PB) |
| pe_ratio | Price-to-earnings ratio (PE, TTM) |
| ps_ratio | Price-to-sales ratio (PS, TTM) |
| turnover_ratio | Turnover ratio (%) |
| circulating_market_cap_ashare | A-share circulating market cap (CNY 100M) |
| market_cap_ashare | A-share total market cap (CNY 100M) |
| circulating_cap_ashare | A-share circulating share capital (10K shares) |
| capitalization_ashare | A-share total share capital (10K shares) |

3. financial: financial statement data
   (11 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| inventories | Inventories (CNY) |
| total_current_assets | Total current assets (CNY) |
| fixed_assets | Fixed assets (CNY) |
| good_will | Goodwill (CNY) |
| total_assets | Total assets (CNY) |
| total_liability | Total liabilities (CNY) |
| operating_revenue | Operating revenue (CNY) |
| operating_profit | Operating profit (CNY) |
| total_profit | Total profit (CNY) |
| net_profit | Net profit (CNY) |
| basic_eps | Basic earnings per share (CNY) |
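A minimal sketch of how these fields are typically consumed once loaded — it assumes the `fetch_data` helper documented in the README and the field names from the tables above:

```python
import firefin

# "close" comes from the quote category, "pe_ratio" from the valuation category.
data = firefin.fetch_data(["close", "pe_ratio"])

# Daily simple returns from close prices (rows are dates, columns are securities).
returns = data["close"].pct_change()

# An illustrative value signal: cross-sectional earnings yield (inverse P/E).
value_factor = 1.0 / data["pe_ratio"]
```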
--------------------------------------------------------------------------------
/docs/docs/data/data.md:
--------------------------------------------------------------------------------
---
title: Data Management
permalink: /data/
nav_order: 4
---

# Data Management

Fire provides a user-friendly interface for downloading and managing financial data. By leveraging the pre-cleaned and processed data pipeline from the Fire Institute, you can focus more on research and modeling rather than data preparation.

Currently, Fire only provides data for the Chinese A-share market. We will provide more data in the future.

# Download Data

We provide a simple command-line interface to download the data. You can use the following command to download the data:

```bash
firefin download
```

This command will download the latest data from the Fire Institute and store it in the `~/.fire/data/raw` directory. All data is organized in Feather format. (We may consider a database or key-value store in the future.) Because we do not update the data frequently, we chose Feather for its fast read/write speed.

# Load Data

If you have downloaded the data manually or received it from another source, you can use the following command to load it into the Firefin system:

```bash
firefin load
```

This command will extract the contents of the provided tar file and place them in the appropriate directory within the Firefin system.

# Data Structure

The data is organized in a structured format to facilitate easy access and manipulation. Here is an overview of the data structure:

| Date | security1 | security2 | security3 | ... | securityN |
|------------|------------|------------|------------|------|------------|
| 2023-01-01 | 10.5 | 10.7 | 10.8 | ... | 10.9 |
| 2023-01-02 | 10.6 | 10.8 | 10.9 | ... | 11.1 |
| ... | ... | ... | ... | ... | ... |
| 2023-12-31 | 11.0 | 11.2 | 11.3 | ... | 11.4 |

1. All data is stored in a single Feather file named `data_name.feather`.
2. Each row represents a date.
3. Each column represents a security, identified by its ticker symbol.
4. The values in the cells represent the closing prices of the securities on the corresponding dates.
5. **The index (dates) and columns (securities) are exactly the same across all datasets** of a given market, for example the Chinese A-share market.

With the above structure, you can easily perform time-series analysis, portfolio optimization, and other financial analyses, without worrying about data alignment issues.
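Because every dataset shares the same index and columns, element-wise pandas operations line up with no reindexing. A minimal sketch (field names as documented above):

```python
import firefin

data = firefin.fetch_data(["close", "volume"])
close, volume = data["close"], data["volume"]

# Identical index/columns mean arithmetic needs no explicit alignment.
assert close.index.equals(volume.index) and close.columns.equals(volume.columns)

# e.g. per-security daily traded value
traded_value = close * volume
```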
--------------------------------------------------------------------------------
/docs/docs/evaluation/AcaEvaluatorModel.md:
--------------------------------------------------------------------------------
# AcaEvaluatorModel

`AcaEvaluatorModel` is a class designed for evaluating the performance of a **single-factor model** using various asset pricing methodologies. It supports portfolio sorting, cross-sectional regression, information coefficient calculations, and anomaly tests.

---

## Class Initialization

```python
AcaEvaluatorModel(factor: pd.DataFrame, forward_returns: ForwardReturns, return_adj: pd.DataFrame)
```

**Parameters**

* `factor` *(pd.DataFrame)*: Factor exposure data (Time × Stock)
* `forward_returns` *(dict\[str, pd.DataFrame])*: Future returns mapped by holding periods (Time × Stock)
* `return_adj` *(pd.DataFrame)*: DataFrame of adjusted returns (Time × Stock)

---

## Methods

### `run_single_sort`

Perform single-factor portfolio sorting.

**Parameters**

* `quantiles` *(int)*: Number of quantile groups (e.g. 5 for quintiles)
* `value_weighted` *(bool)*: Use value-weighted portfolios if `True`; otherwise, equal-weighted
* `return_stats` *(bool)*: Whether to return statistics for the H-L portfolio
* `market_cap` *(pd.DataFrame)*: Market cap data (required if `value_weighted=True`)
* `get_quantile_sorts` *(bool)*: Whether to return quantile labels for each stock

**Returns**

* If `return_stats=True`: `Tuple[QuantileReturns, dict]`
* Else: `QuantileReturns`

---

### `run_fama_macbeth`

Run two-stage Fama-MacBeth cross-sectional regression.

**Parameters**

* `window` *(int)*: Rolling window size for first-stage regression (default: 252)
* `return_stats` *(bool)*: Whether to return statistical summary

**Returns**

* If `return_stats=True`: `Tuple[RegressionResult, dict]`
* Else: `RegressionResult`

---

### `run_ic`

Compute Information Coefficients (IC) across time.

**Parameters**

* `method` *(str)*: Correlation type, one of `'pearson'`, `'spearman'`, or `'kendall'`

**Returns**

* `pd.DataFrame`: IC values by period

---

### `run_regression`

Run static or rolling regression of returns on factor exposures.

**Parameters**

* `rolling` *(bool)*: Whether to run rolling regression
* `window` *(int)*: Rolling window size (only used if `rolling=True`)
* `fit_intercept` *(bool)*: Include intercept term if `True`

**Returns**

* `BatchRegressionResult` or `dict` (if rolling)

---

### `run_anomaly_test`

Conduct anomaly tests by regressing returns on the factor.

**Parameters**

* `portfolio_returns` *(dict\[str, pd.DataFrame])*: DataFrame of portfolio returns, with each column representing a distinct portfolio (quantile returns)
* `cov_type` *(Optional\[str])*: Type of covariance estimator (e.g., `'HAC'`, `'HC0'`, etc.)
* `cov_kwds` *(Optional\[dict])*: Additional keyword arguments for the covariance estimator
* `return_stats` *(bool)*: Whether to return summary statistics

**Returns**

* If `return_stats=True`: `Tuple[AnomalyTest, pd.DataFrame]`
* Else: `AnomalyTest`

---

### `run_all`

Run all available evaluation methods and return results in a dictionary.

**Returns**

* `dict`: Keys include:

  * `'single_sort_res'`, `'single_sort_stat'`
  * `'fama_macbeth'`
  * `'information_coefficient'`
  * `'regression'`
  * `'anomaly'`

---

## Notes

* `run_regression` uses either time-by-time OLS or rolling regression depending on the `rolling` flag.
* `run_all` is useful for executing a full evaluation pipeline for a single factor.
* Ensure `market_cap` is provided when performing value-weighted portfolio sorts.
* `return_adj` in `run_fama_macbeth` should be matched to the target return horizon.
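A minimal end-to-end sketch of the workflow described above. It assumes the `firefin` data helpers from the README; the import path is inferred from the repo layout, and constructor arguments follow the signature at the top of this page.

```python
import firefin
# Import path inferred from the repo layout; adjust if firefin re-exports the class.
from firefin.evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel

data = firefin.fetch_data(["open", "close", "volume", "return_adj"])
factor = data["close"].rolling(20).corr(data["volume"])  # any (Time x Stock) factor
fr = firefin.compute_forward_returns(data["open"].shift(-1), [1, 5, 10])

model = AcaEvaluatorModel(factor=factor, forward_returns=fr, return_adj=data["return_adj"])
quantile_rets, hl_stats = model.run_single_sort(quantiles=5, value_weighted=False, return_stats=True)
ic = model.run_ic(method="pearson")
results = model.run_all()
```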
--------------------------------------------------------------------------------
/docs/docs/evaluation/AcaEvaluatorModelComparison.md:
--------------------------------------------------------------------------------
# AcaEvaluatorModelComparison

`AcaEvaluatorModelComparison` is a class designed for evaluating **multi-factor models**, supporting methods such as double portfolio sorting and maximum Sharpe ratio (MSR) comparison between two models.

---

## Class Initialization

```python
AcaEvaluatorModelComparison(factor1: pd.DataFrame, factor2: pd.DataFrame, forward_returns: ForwardReturns)
```

**Parameters**

* `factor1` *(pd.DataFrame)*: First factor exposure matrix (Time × Stock)
* `factor2` *(pd.DataFrame)*: Second factor exposure matrix (Time × Stock)
* `forward_returns` *(dict\[str, pd.DataFrame])*: Future returns by holding period (Time × Stock)

---

## Methods

### `run_double_sort`

Perform double-sort portfolio sorting based on two factors.

**Parameters**

* `quantiles` *(Tuple\[int, int])*: Number of quantiles for each factor (e.g., (5, 5))
* `dependent` *(bool)*: Whether to apply dependent (conditional) sorting
* `value_weighted` *(bool)*: Use value-weighted returns if `True`; otherwise equal-weighted
* `market_cap` *(pd.DataFrame)*: Market cap data, required if `value_weighted=True`
* `get_quantile_sorts` *(bool)*: Whether to return the portfolio labels for each stock

**Returns**

* `QuantileReturns` or `dict[str, pd.DataFrame]` (if `get_quantile_sorts=True`)

---

### `run_msr_test`

Compare the maximum Sharpe ratios (MSR) of the two models using a statistical test.

**Parameters**

* `regularize` *(bool)*: Whether to apply shrinkage regularization to the covariance matrix

**Returns**

* `dict`:

  * `'msr_a'`: Maximum Sharpe ratio for model A
  * `'msr_b'`: Maximum Sharpe ratio for model B
  * `'test_stat'`: Z-statistic of the MSR test
  * `'p_value'`: Corresponding two-sided p-value

---

### `run_all`

Run all available evaluation methods in the class.

**Parameters**

* `market_cap` *(pd.DataFrame)*: Required if `value_weighted=True` in `run_double_sort`

**Returns**

* `dict`:

  * `'double_sort'`: Result of the double sort
  * `'msr_test'`: Result of the MSR comparison between the two factor models

---

## Notes

* `run_double_sort` supports both independent and nested (conditional) sorting based on two factors.
* `run_msr_test` is based on a Z-test for comparing Sharpe ratios under multivariate settings.
* `run_all` is a quick way to benchmark model performance using all implemented tools.
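A minimal sketch of a two-factor comparison, assuming the `firefin` data helpers from the README; the import path is inferred from the repo layout, and the two factor definitions are purely illustrative.

```python
import firefin
# Import path inferred from the repo layout; adjust if firefin re-exports the class.
from firefin.evaluation.academia.AcaEvaluatorModelComparison import AcaEvaluatorModelComparison

data = firefin.fetch_data(["open", "close", "volume"])
factor1 = data["close"].rolling(20).corr(data["volume"])  # price-volume correlation
factor2 = -data["close"].pct_change(20)                   # 20-day reversal, illustrative only
fr = firefin.compute_forward_returns(data["open"].shift(-1), [1, 5, 10])

comp = AcaEvaluatorModelComparison(factor1=factor1, factor2=factor2, forward_returns=fr)
double_sort = comp.run_double_sort(quantiles=(5, 5), dependent=True, value_weighted=False)
msr = comp.run_msr_test(regularize=False)  # keys: msr_a, msr_b, test_stat, p_value
```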
45 |
46 | **Parameters:**
47 |
48 | * `quantiles` (int): Number of quantile groups (e.g., 5 for quintiles)
49 | * `value_weighted` (bool): If True, portfolios are value-weighted; otherwise equal-weighted
50 | * `return_stats` (bool): Whether to return statistics (mean, t-value, p-value) of high-minus-low (H-L) portfolios
51 | * `market_cap` (pd.DataFrame): Required if `value_weighted` is True; same shape as `factor`
52 | * `get_quantile_sorts` (bool): Return group labels of stocks by quantile
53 |
54 | **Returns:**
55 |
56 | * Quantile portfolio returns, or a tuple of (returns, statistics) if `return_stats=True`
57 |
58 | ---
59 |
60 | ### `run_double_sort()`
61 |
62 | Perform double sorting based on two factors.
63 |
64 | **Parameters:**
65 |
66 | * `factor2` (pd.DataFrame): Second factor
67 | * `quantiles` (tuple\[int, int]): Quantile group counts for each factor
68 | * `dependent` (bool): Use nested sort if True
69 | * `value_weighted` (bool): If True, portfolios are value-weighted; otherwise equal-weighted
70 | * `market_cap` (pd.DataFrame): Required if `value_weighted` is True; same shape as `factor`
71 | * `get_quantile_sorts` (bool): Return group labels of stocks by quantile
72 |
73 | **Returns:**
74 |
75 | * Portfolio return structure or dictionary of quantile groupings
76 |
77 | ---
78 |
79 | ### `run_fama_macbeth()`
80 |
81 | Run two-pass Fama-MacBeth regression.
82 |
83 | **Parameters:**
84 |
85 | * `return_adj` (pd.DataFrame): Adjusted returns matrix
86 | * `window` (int): First-stage rolling window (default: 252)
87 | * `return_stats` (bool): Return t-statistics and significance
88 |
89 | **Returns:**
90 |
91 | * Regression result or (result, statistics) tuple
92 |
93 | ---
94 |
95 | ### `run_ic()`
96 |
97 | Calculate the Information Coefficient between factor values and future returns.
98 |
99 | **Parameters:**
100 |
101 | * `method` (str): Correlation method ('pearson', 'spearman', or 'kendall')
102 |
103 | **Returns:**
104 |
105 | * `pd.DataFrame`: Time series of IC values
106 |
107 | ---
108 |
109 | ### `run_msr_test()`
110 |
111 | Compare the maximum Sharpe ratios (MSRs) of two factor models using a z-test.
112 |
113 | **Parameters:**
114 |
115 | * `model_a_factors` (pd.DataFrame): Factor returns of Model A (Time × K)
116 | * `model_b_factors` (pd.DataFrame): Factor returns of Model B (Time × K)
117 | * `regularize_covariance` (bool): If True, regularize the covariance matrix.
118 |
119 | **Returns:**
120 |
121 | * Dictionary with keys:
122 | - `msr_a`: Maximum Sharpe ratio of Model A
123 | - `msr_b`: Maximum Sharpe ratio of Model B
124 | - `test_stat`: z-test statistic comparing MSRs
125 | - `p_value`: p-value of the test
126 |
127 | ---
128 |
129 | ### `run_regression()`
130 |
131 | Run either standard or rolling time-series regression on test portfolios, depending on the `rolling` flag.
132 |
133 | **Parameters:**
134 |
135 | * `rolling` (bool, optional): Whether to perform rolling regression, by default False.
136 | * `window` (int, optional): Rolling window size (only used if `rolling=True`), by default 60.
137 | * `fit_intercept` (bool, optional): Whether to include an intercept in the regression, by default True.
138 |
139 | **Returns:**
140 |
141 | * `BatchRegressionResult` or `dict`: a static regression result object, or a dictionary of rolling results.
142 | ---
143 |
144 | ### `get_grs_test()`
145 |
146 | Run the GRS test for overall model explanatory power.
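For background, a common textbook form of the statistic, with `N` test portfolios, `K` factors and `T` periods, is

$$
\mathrm{GRS} \;=\; \frac{T-N-K}{N}\cdot\frac{\hat{\alpha}^{\top}\hat{\Sigma}^{-1}\hat{\alpha}}{1+\bar{\mu}_f^{\top}\hat{\Omega}^{-1}\bar{\mu}_f} \;\sim\; F_{N,\,T-N-K},
$$

where the numerator quadratic form uses the vector of estimated intercepts and the residual covariance matrix, and the denominator uses the factor sample mean and covariance (maximum-likelihood, divide-by-T estimates). Small-sample scalings differ slightly across presentations, so the implementation's exact constant may vary.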
147 | 148 | **Parameters:** 149 | 150 | * `test_portfolios`: Time-series returns of test portfolios 151 | * `plot`: Whether to generate visual output 152 | 153 | **Returns:** 154 | 155 | * Dictionary with keys: `grs_stat`, `p_value`, `alphas`, `t_stats`, `residual_cov`, `betas` 156 | 157 | --- 158 | 159 | ### `get_hj_distance_test()` 160 | 161 | Compute HJ distance to assess pricing error. 162 | 163 | **Parameters:** 164 | 165 | * `test_portfolios`: Portfolio return matrix 166 | * `plot`: Whether to visualize 167 | 168 | **Returns:** 169 | 170 | * Dictionary with HJ distance, t-stat, alpha, betas, residual\_cov 171 | 172 | --- 173 | 174 | ### `compare_model_alphas()` 175 | 176 | Compare intercepts across different models. 177 | 178 | **Parameters:** 179 | 180 | * `models`: Dictionary of model name → factor return 181 | * `test_portfolios`: Test portfolio returns 182 | * `plot`: Whether to display comparison plot 183 | 184 | **Returns:** 185 | 186 | * Dictionary of results per model: alpha, t\_stat, mean\_abs\_alpha, mean\_abs\_t 187 | 188 | --- 189 | 190 | ### `run_horse_race_regression()` 191 | 192 | Run horse race regression to assess marginal explanatory power. 193 | 194 | **Parameters:** 195 | 196 | * `candidate_factors`: Dict of factor name → exposure DataFrame 197 | * `forward_return_key`: Key to select the return horizon 198 | * `date`: If set, single-period regression; otherwise multi-period 199 | * `plot`: Whether to visualize t-stats 200 | 201 | **Returns:** 202 | 203 | * Dictionary with `coefs`, `mean_coef`, `t_stat`, `p_value` 204 | 205 | --- 206 | 207 | ### `get_spanning_test()` 208 | 209 | Test whether a new factor can be spanned by base factors. 210 | 211 | **Parameters:** 212 | 213 | * `new_factor`: Series of the new factor 214 | * `base_model_factors`: Existing model factors (DataFrame) 215 | * `plot`: Whether to show visualization 216 | 217 | **Returns:** 218 | 219 | * Dictionary with `r_squared`, `alpha`, `t_stat`, `p_value`, `beta`, `resid_std` 220 | 221 | --- 222 | 223 | ### `run_subsample_analysis()` 224 | 225 | Run out-of-sample robustness checks across different time periods. 226 | 227 | **Parameters:** 228 | 229 | * `method`: "ic", "alpha", or "quantile\_returns" 230 | * `split_dates`: List of split timestamps 231 | * `forward_return_key`: Return name (if needed) 232 | * `quantiles`: Group count for sorting (if used) 233 | * `plot`: Whether to visualize comparison 234 | 235 | **Returns:** 236 | 237 | * Dictionary with per-sample evaluation results 238 | 239 | --- 240 | 241 | ### `compute_vif()` 242 | 243 | Detect multicollinearity using variance inflation factors (VIF). 
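A hedged sketch of the underlying computation using statsmodels (the `factors` frame and `vif_table` helper are illustrative placeholders; the class method may differ in details and additionally reports `max_vif`/`mean_vif`):

```python
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(factors: pd.DataFrame) -> pd.Series:
    """VIF_k = 1 / (1 - R_k^2), where R_k^2 regresses factor k on the others."""
    X = sm.add_constant(factors.dropna())
    vifs = {col: variance_inflation_factor(X.values, i)
            for i, col in enumerate(X.columns) if col != "const"}
    return pd.Series(vifs, name="vif")
```

As a rule of thumb, VIFs above roughly 5–10 are usually read as a multicollinearity warning.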
244 |
245 | **Parameters:**
246 |
247 | * `factors`: Factor exposure matrix (T × K)
248 | * `plot`: Show bar plot of VIFs
249 |
250 | **Returns:**
251 |
252 | * Dictionary with `vif` (Series), `max_vif`, `mean_vif`
253 |
-------------------------------------------------------------------------------- /docs/docs/evaluation/evaluation.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Evaluate Factor
3 | permalink: /evaluate/
4 | nav_order: 6
5 | ---
6 | How to evaluate factors in your project
7 | {: .fs-6 .fw-300 }
-------------------------------------------------------------------------------- /docs/docs/release-plan.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Release Plan
3 | permalink: /release-plan/
4 | nav_order: 2
5 | ---
6 |
7 | # Release Plan
8 |
9 | | Stage Name | Begin Time | End Time | Note |
10 | | ----------------------------- | ---------- | --------- | -------------------------------------------------- |
11 | | Collect key features | 2025/2/10 | 2025/2/20 | Collect requirements for the release |
12 | | Change Review | 2025/2/20 | 2025/2/25 | Review project requirements (upgrade / retire / deprecate) |
13 | | Develop | 2025/2/25 | 2025/3/15 | Develop new features and merge into master |
14 | | Build & Alpha | 2025/3/16 | 2025/3/30 | Merge newly developed features, release the Alpha version |
15 | | Test round 1 | 2025/3/30 | 2025/4/07 | First round of testing |
16 | | Beta version release | 2025/4/08 | 2025/4/08 | Beta version release |
17 | | Test round 2 | 2025/4/09 | 2025/4/15 | Bug fixes and second round of testing |
18 | | Release Review | 2025/4/16 | 2025/4/16 | Release go/no-go decision |
19 | | Release preparation | 2025/4/16 | 2025/4/16 | Pre-release preparation, organize release artifacts |
20 | | Release | 2025/4/17 | 2025/4/17 | Official release |
21 |
22 |
23 |
24 |
25 |
26 | # Feature list
27 |
28 | Status legend:
29 |
30 | - Discussion (proposal under discussion, requirement not yet accepted)
31 | - Developing (in development)
32 | - Testing (in testing)
33 | - Accepted (accepted and delivered)
34 | - Rejected (rejected / not delivered)
35 |
36 |
37 | | Feature Name | Status | Owner |
38 | | ------------ | ------ | ----- |
39 | | Consensus Factor Models: CAPM, FF3, FF3+Mom, FF3+MOM+LIQ, FF5 | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
40 | | Portfolio Sort | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
41 | | Double sorting | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
42 | | Fama-MacBeth Regression | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
43 | | Testing Anomalies (Time-Series Regression) | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
44 | | GRS Test | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
45 | | Support More Algorithms | Developing | [@qiaobaochen](https://github.com/qiaobaochen) |
46 | | Project CI/CD | Developing | [@qiaobaochen](https://github.com/qiaobaochen)|
-------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Home
3 | layout: home
4 | nav_order: 1
5 | ---
6 | # F.I.R.E. Factor Investment Research Engine
7 |
8 | This project is the companion open-source toolkit for the book _Navigating the Factor Zoo: The Science of Quantitative Investing_.
9 |
10 | The Fire project serves as a development and evaluation toolkit for factor research and portfolio construction.
It is designed to be simple and easy to use, and it is built on top of popular Python libraries like pandas, numpy, and scikit-learn.
11 |
12 | Fire focuses on three critical aspects of factor research and portfolio construction:
13 |
14 | 1. **Data Management**: Fire provides a user-friendly interface for downloading and managing financial data. By leveraging the pre-cleaned and processed data pipeline from the Fire Institute, you can focus more on research and modeling rather than data preparation.
15 |
16 | 2. **Construction (Calculation)**: Fire offers a variety of algorithms for factor construction. Additionally, it allows users to build their own factors using popular libraries such as pandas, numpy, and scikit-learn.
17 |
18 | 3. **Evaluation**: Factor evaluation is a complex and crucial step in research. Fire provides a comprehensive set of tools to assess factor performance, bridging the gap between academic and industry evaluation practices.
19 |
20 | ----
21 |
22 | ## Quick Start
23 |
24 | ## Installation
25 |
26 | ```bash
27 | # The package has not been released to PyPI yet, so `pip install firefin` will not work; install from source instead.
28 | pip install firefin
29 |
30 | # Install from source for local testing
31 | ## replace $ThisRepoURL with the actual repo url
32 | git clone $ThisRepoURL
33 | ## install from source
34 | pip install -e .
35 | ```
36 |
37 | ## Usage
38 |
39 | Download the data
40 | from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz)
41 |
42 | Run the command below to download the data and put it in the correct path automatically.
43 |
44 | ```bash
45 | # This repo has not been fully released yet, so you may need to download the data manually (see the link above).
46 | # Auto download data
47 | firefin download
48 | ```
49 |
50 | If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to verify the archive and put the data in the correct path:
51 |
52 | ```bash
53 | # replace path_to_data.tar.gz with the actual path
54 | firefin load path_to_data.tar.gz
55 | ```
56 |
57 | ## Start coding
58 |
59 | ```python
60 | import firefin
61 |
62 | # get data
63 | data = firefin.fetch_data(["open", "close", "volume"])
64 | open_price = data["open"]
65 |
66 |
67 | def pv_corr(close, volume):
68 |     # price volume correlation
69 |     return close.rolling(20).corr(volume)
70 |
71 |
72 | factor = pv_corr(data["close"], data["volume"])
73 |
74 | # compute forward returns
75 | fr = firefin.compute_forward_returns(open_price.shift(-1), [1, 5, 10])
76 |
77 | # evaluate factor
78 | mng = firefin.Evaluator(factor, fr)
79 | mng.get_ic("pearson")
80 | mng.get_quantile_returns(5)
81 |
82 | ```
-------------------------------------------------------------------------------- /file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/file.py
-------------------------------------------------------------------------------- /firefin/__init__.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 |
4 | from .compute.window import *
5 | from .data.gateway import fetch_data
6 | from .evaluation.eva_utils import compute_forward_returns, compute_ic, compute_quantile_returns
7 | from
.evaluation.industry.evaluator import Evaluator
8 | from .core.plot.plots import plt_ic, plt_quantile_cumulated_end_returns, plt_quantile_cumulative_returns
9 | from .evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel
10 | from .evaluation.academia.AcaEvaluatorModelComparison import AcaEvaluatorModelComparison
-------------------------------------------------------------------------------- /firefin/cli/command.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 |
4 | import os
5 | import sys
6 | import click
7 | import subprocess
8 | from ..common.config import DATA_PATH, logger
9 |
10 |
11 | @click.group()
12 | def cli():
13 |     pass
14 |
15 |
16 | @click.command(help="Display help")
17 | def help():
18 |     click.echo("Help")
19 |
20 |
21 | def _prepare_folder():
22 |     # check if data directory exists
23 |     if not DATA_PATH.exists():
24 |         DATA_PATH.mkdir(parents=True, exist_ok=True)
25 |         logger.info("Data directory is created.")
26 |     else:
27 |         logger.info("Data directory already exists. Skipping creating directory.")
28 |
29 |
30 | # TODO: Add more data sources
31 | @click.command(help="Download data")
32 | @click.option('--data_url', default=None, help='download from provided url')
33 | def download(data_url):
34 |     logger.info("Preparing Data for the first time ...")
35 |     _prepare_folder()
36 |
37 |     if data_url:
38 |         if not data_url.startswith("http"):
39 |             raise Exception("Please provide a valid url to download data from.")
40 |         if not data_url.endswith(".tar.gz"):
41 |             raise Exception("Please provide a valid url to download data from. The url should end with .tar.gz")
42 |         data_file_name = data_url.split("/")[-1]
43 |         raw_data_path = DATA_PATH / data_file_name
44 |         request_url = data_url
45 |     else:
46 |         # download the default package
47 |         logger.info("No URL provided, will download default AStockData.tar.gz from GitHub.")
48 |         raw_data_path = DATA_PATH / "AStockData.tar.gz"
49 |         request_url = (
50 |             "https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz"
51 |         )
52 |
53 |     # Check if the gz file already exists
54 |     if raw_data_path.exists():
55 |         logger.info("Data already exists, removing it first ...")
56 |         # ensure the file is removed before downloading again
57 |         raw_data_path.unlink(missing_ok=True)  # remove the file
58 |         logger.info("Data removed.")
59 |
60 |     logger.info("Downloading data ...")
61 |     # Download data from the file server
62 |     try:
63 |         subprocess.run(f"wget {request_url} -O {raw_data_path}", shell=True, check=True)
64 |         subprocess.run(
65 |             f'tar -xvf {raw_data_path} -C {DATA_PATH} --strip-components=1',
66 |             shell=True,
67 |             check=True
68 |         )
69 |     except subprocess.CalledProcessError as e:
70 |         logger.error(f"Command execution failed: {e}")
71 |         sys.exit(1)
72 |     except KeyboardInterrupt:
73 |         logger.info("KeyboardInterrupt (Ctrl+C), program terminated")
74 |         sys.exit(0)
75 |
76 |
77 | @click.command(help="Prepare data")
78 | @click.argument("file_path", type=click.Path(exists=True))
79 | def load(file_path: str = None):
80 |     logger.info("Preparing Data for the first time ...")
81 |     _prepare_folder()
82 |
83 |     # tar unzip the file, print progress
84 |     try:
85 |         subprocess.run(
86 |             f'tar -xvf {file_path} -C {DATA_PATH} --strip-components=1',
87 |             shell=True,
88 |             check=True
89 |         )
90 |     except subprocess.CalledProcessError as e:
91 |         logger.error(f"Command
execution failed: {e}")
92 |         sys.exit(1)
93 |     except KeyboardInterrupt:
94 |         logger.info("KeyboardInterrupt (Ctrl+C), program terminated")
95 |         sys.exit(0)
96 |
97 | cli.add_command(help)
98 | cli.add_command(download)
99 | cli.add_command(load)
-------------------------------------------------------------------------------- /firefin/common/config.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 | import os
4 | import pathlib
5 | import yaml
6 | import json
7 | from loguru import logger
8 |
9 | # Load configuration from YAML file
10 | with open(os.path.join(os.path.dirname(__file__), "config.yaml"), "r") as stream:
11 |     config = yaml.safe_load(stream)
12 |
13 |
14 |
15 | # Define DATA_PATH based on configuration, expanding user and resolving path
16 |
17 | if os.name == "posix":
18 |     DATA_PATH = config.get("data_paths", {}).get("unix", "~/.fire/data/raw/")
19 | else:
20 |     DATA_PATH = config.get("data_paths", {}).get("windows", "%USERPROFILE%\\.fire\\data\\raw")
21 |
22 | # resolve ~ and env vars
23 | DATA_PATH = pathlib.Path(DATA_PATH).expanduser().resolve()
24 |
25 | # load data maps from config
26 | DATA_MAPS = config.get("data_maps", {})
27 |
28 | json_files = list(DATA_PATH.glob("*.json"))
29 | if json_files:
30 |     for json_file in json_files:
31 |         with open(json_file, "r") as f:
32 |             DATA_MAPS.update(json.load(f))
33 | else:
34 |     import multiprocessing as _mp
35 |
36 |     if _mp.current_process().name == "MainProcess":
37 |         logger.info("No additional JSON files found in DATA_PATH, load default DATA_MAPS.")
38 |
-------------------------------------------------------------------------------- /firefin/common/config.yaml: --------------------------------------------------------------------------------
1 | data_paths:
2 |   unix: ~/.fire/data/raw/
3 |   windows: '%USERPROFILE%\.fire\data\raw\'
4 |
5 | data_maps:
6 |   # quote: price/volume data
7 |   open: file::feather
8 |   close: file::feather
9 |   high: file::feather
10 |   low: file::feather
11 |   volume: file::feather
12 |   money: file::feather
13 |   return_adj: file::feather
14 |   vwap: file::feather
15 |   adj_factor: file::feather
16 |   open_dr: file::feather
17 |   high_dr: file::feather
18 |   low_dr: file::feather
19 |   close_dr: file::feather
20 |   volume_dr: file::feather
21 |   vwap_dr: file::feather
22 |   FinanceValue: file::feather
23 |   FinanceBuyValue: file::feather
24 |   FinanceRefundValue: file::feather
25 |   SecurityVolume: file::feather
26 |   SecuritySellVolume: file::feather
27 |   SecurityRefundVolume: file::feather
28 |   SecurityValue: file::feather
29 |   TradingValue: file::feather
30 |   FinaInTotalRatio: file::feather
31 |   SecuInTotalRatio: file::feather
32 |   shares_holding: file::feather
33 |   hold_ratio: file::feather
34 |   adjusted_hold_ratio: file::feather
35 |   # valuation: valuation data
36 |   circulating_market_cap: file::feather
37 |   pcf_ratio: file::feather
38 |   market_cap: file::feather
39 |   pe_ratio_lyr: file::feather
40 |   circulating_cap: file::feather
41 |   capitalization: file::feather
42 |   pb_ratio: file::feather
43 |   pe_ratio: file::feather
44 |   ps_ratio: file::feather
45 |   turnover_ratio: file::feather
46 |   circulating_market_cap_ashare: file::feather
47 |   market_cap_ashare: file::feather
48 |   circulating_cap_ashare: file::feather
49 |   capitalization_ashare: file::feather
50 |   # financial: financial statement data
51 |   inventories: file::feather
52 |
total_current_assets: file::feather 53 | fixed_assets: file::feather 54 | good_will: file::feather 55 | total_assets: file::feather 56 | total_liability: file::feather 57 | operating_revenue: file::feather 58 | operating_profit: file::feather 59 | total_profit: file::feather 60 | net_profit: file::feather 61 | basic_eps: file::feather 62 | 63 | -------------------------------------------------------------------------------- /firefin/common/const.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import pandas as pd 5 | 6 | # for minute data bartimes 7 | _morning = pd.date_range("2020-01-01 09:30", "2020-01-01 11:30", freq="1 min") 8 | _afternoon = pd.date_range("2020-01-01 13:00", "2020-01-01 15:00", freq="1 min") 9 | MIN_BARTIMES = _morning.union(_afternoon).strftime("%H:%M") 10 | -------------------------------------------------------------------------------- /firefin/compute/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | """ 5 | utility functions for computing 6 | 7 | """ 8 | -------------------------------------------------------------------------------- /firefin/compute/window.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import pandas as pd 5 | import typing 6 | from ..core.algorithm import _numba_funcs 7 | 8 | __all__ = ["ts_corr"] 9 | 10 | 11 | def ts_corr(x: pd.DataFrame, y: pd.DataFrame, n: int, method: typing.Literal["pearson", "kendall", "spearman"]): 12 | x, y = x.align(y, join="outer", copy=False) 13 | result = pd.DataFrame( 14 | _numba_funcs.ts_corr(x.values, y.values, n, method), 15 | index=x.index, 16 | columns=x.columns, 17 | ) 18 | return result 19 | -------------------------------------------------------------------------------- /firefin/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/__init__.py -------------------------------------------------------------------------------- /firefin/core/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/algorithm/__init__.py -------------------------------------------------------------------------------- /firefin/core/algorithm/_numba_funcs.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import numpy as np 5 | from numba import njit 6 | 7 | 8 | @njit 9 | def _validate_pairwise(x, y): 10 | if x.ndim != 2 or y.ndim != 2: 11 | raise ValueError("_validate_pairwise: Both inputs must be 2D arrays") 12 | n1, m1 = x.shape 13 | n2, m2 = y.shape 14 | if n1 != n2: 15 | raise 
ValueError("_validate_pairwise: Both inputs must have the same number of rows") 16 | if m1 != m2 and min(m1, m2) != 1: 17 | raise ValueError("_validate_pairwise: Both inputs must have the same number of columns or one column") 18 | 19 | 20 | @njit 21 | def _corr_pearson(x, y): 22 | assert len(x) == len(y) 23 | msk = np.isfinite(x) & np.isfinite(y) 24 | if msk.sum() <= 3: 25 | return np.nan 26 | elif msk.all(): 27 | x_ = x 28 | y_ = y 29 | else: 30 | x_ = x[msk] 31 | y_ = y[msk] 32 | mean_x = np.mean(x_) 33 | mean_y = np.mean(y_) 34 | x_centered = x_ - mean_x 35 | y_centered = y_ - mean_y 36 | var_x = np.sum(x_centered**2) 37 | if var_x == 0: 38 | return np.nan 39 | var_y = np.sum(y_centered**2) 40 | if var_y == 0: 41 | return np.nan 42 | cov = np.sum(x_centered * y_centered) 43 | return cov / np.sqrt(var_x * var_y) 44 | 45 | 46 | @njit 47 | def corr(x, y, method="pearson"): 48 | if x.ndim != 1 or y.ndim != 1: 49 | raise ValueError("corr: Both inputs must be 1D arrays") 50 | if x.shape != y.shape: 51 | raise ValueError("corr: Both inputs must have the same shape") 52 | if method == "pearson": 53 | return _corr_pearson(x, y) 54 | else: 55 | raise NotImplementedError("corr: Only Pearson correlation is supported") 56 | 57 | 58 | @njit 59 | def ts_corr(x, y, w, method="pearson"): 60 | _validate_pairwise(x, y) 61 | n, m1 = x.shape 62 | _, m2 = y.shape 63 | k = max(m1, m2) 64 | out = np.full((n, k), np.nan) 65 | for i in range(n): 66 | x_ = x[max(0, i - w + 1) : i + 1] 67 | y_ = y[max(0, i - w + 1) : i + 1] 68 | for j in range(k): 69 | x__ = x_[:, min(j, m1 - 1)] 70 | y__ = y_[:, min(j, m2 - 1)] 71 | out[i, j] = corr(x__, y__, method) 72 | return out 73 | -------------------------------------------------------------------------------- /firefin/core/algorithm/newey_west_ttest_1samp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate Newey-West Adjusted Standard Error in t-test for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing a one-sample t-test with Newey-West adjusted standard errors. 5 | The implementation focuses on clarity, comprehensive documentation, and best practices for financial research. 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from typing import Union, Tuple 11 | import statsmodels.api as sm 12 | 13 | class NeweyWestTTest: 14 | """ 15 | A class for performing a one-sample t-test using Newey-West adjusted standard errors. 16 | """ 17 | 18 | @staticmethod 19 | def newey_west_ttest_1samp(data: Union[np.ndarray, pd.Series, list], 20 | popmean: float = 0.0, 21 | lags: int = 4, 22 | nan_policy: str = 'omit') -> Tuple[float, float, float]: 23 | """ 24 | Perform a one-sample t-test using Newey-West adjusted standard errors. 25 | 26 | Parameters 27 | ---------- 28 | data : array-like 29 | The sample data. 30 | popmean : float, optional 31 | The hypothesized population mean (default is 0.0). 32 | lags : int, optional 33 | The number of lags for Newey-West adjustment (default is 4). 34 | nan_policy : {'propagate', 'omit', 'raise'}, optional 35 | Defines how to handle input NaNs: 36 | 'propagate' : if a NaN is present in the input, return NaN for all outputs. 37 | 'omit' : omit NaNs when performing the calculation. If insufficient data remains, return NaN. 38 | 'raise' : if a NaN is present, raise a ValueError. 39 | (default is 'omit'). 40 | 41 | Returns 42 | ------- 43 | t_value : float 44 | The t-statistic. 
45 |         p_value : float
46 |             The p-value for the t-test.
47 |         se : float
48 |             The Newey-West adjusted standard error.
49 |
50 |         Raises
51 |         ------
52 |         ValueError
53 |             If the input data is not one-dimensional or if nan_policy is set to 'raise' and data contains NaNs.
54 |         """
55 |         # Convert input data to a NumPy array
56 |         data_arr = np.asarray(data)
57 |         # Ensure the data is one-dimensional
58 |         if data_arr.ndim != 1:
59 |             raise ValueError("Input data must be a one-dimensional array or series. Only a single variable is allowed.")
60 |
61 |         # Validate nan_policy argument
62 |         if nan_policy not in ['propagate', 'omit', 'raise']:
63 |             raise ValueError("nan_policy must be one of 'propagate', 'omit', or 'raise'.")
64 |
65 |         # Handle NaN values according to nan_policy
66 |         if nan_policy == 'propagate':
67 |             if np.isnan(data_arr).any():
68 |                 return np.nan, np.nan, np.nan
69 |         elif nan_policy == 'raise':
70 |             if np.isnan(data_arr).any():
71 |                 raise ValueError("Input data contains NaN values.")
72 |         elif nan_policy == 'omit':
73 |             data_arr = data_arr[~np.isnan(data_arr)]
74 |             # If insufficient data remains after omitting NaNs, return NaN values (as documented above)
75 |             if data_arr.size < 2:
76 |                 return np.nan, np.nan, np.nan
77 |
78 |         # If the data length is still insufficient, raise an error
79 |         if data_arr.size < 2:
80 |             raise ValueError("Insufficient data (length < 2).")
81 |
82 |         # Adjust the data by subtracting the hypothesized population mean
83 |         adjusted_data = data_arr - popmean
84 |         # Create an intercept term (a column of ones)
85 |         X = np.ones(len(adjusted_data))
86 |         # Fit an OLS model with Newey-West (HAC) standard errors
87 |         model = sm.OLS(adjusted_data, X).fit(cov_type='HAC', cov_kwds={'maxlags': lags})
88 |         # Extract the t-statistic, p-value, and standard error
89 |         t_value = model.tvalues[0]
90 |         p_value = model.pvalues[0]
91 |         se = model.bse[0]
92 |
93 |         return t_value, p_value, se
94 |
-------------------------------------------------------------------------------- /firefin/core/algorithm/regression.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Created : 2025/3/26 17:01
3 | # @Author  : Liao Renjie
4 | # @Email   : liao.renjie@techfin.ai
5 | # @File    : regression.py
6 | # @Software: PyCharm
7 |
8 | import textwrap
9 | import typing
10 |
11 | import numpy as np
12 | import pandas as pd
13 | import statsmodels.api as sm
14 | from joblib import Parallel, delayed
15 |
16 | __all__ = ["least_square", "RollingRegressor", "rolling_regression", "table_regression"]
17 |
18 | NotProvided = object()
19 |
20 |
21 | class RegressionResult:
22 |     """
23 |     Encapsulate the results from `least_square`.
24 |
25 |     Parameters
26 |     ----------
27 |     sm_result: sm.regression.linear_model.RegressionResults
28 |         The regression results object from the statsmodels library.
29 |     fit_intercept: bool
30 |         Whether to fit an intercept term.
31 |     univariate: bool
32 |         Whether it is a univariate regression.
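        If True, the fitted `beta` is exposed as a scalar; otherwise it is a 1-D coefficient array.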
33 | 34 | """ 35 | 36 | def __init__(self, sm_result: sm.regression.linear_model.RegressionResults, fit_intercept: bool, univariate: bool): 37 | self.sm_result = sm_result 38 | self.fit_intercept = fit_intercept 39 | self.univariate = univariate 40 | 41 | @property 42 | def alpha(self): 43 | """float or None""" 44 | if self.fit_intercept: 45 | return self.sm_result.params[0] 46 | else: 47 | return None 48 | 49 | @property 50 | def beta(self): 51 | """1D array if multivariate or float if univariate""" 52 | if self.univariate: 53 | return self.sm_result.params[-1] 54 | else: 55 | # multivariate 56 | if self.fit_intercept: 57 | return self.sm_result.params[1:] 58 | else: 59 | return self.sm_result.params 60 | 61 | @property 62 | def r2(self): 63 | """ 64 | Return the coefficient of determination R² of the regression. 65 | 66 | Returns 67 | ------- 68 | float 69 | The R² value. 70 | """ 71 | return self.sm_result.rsquared 72 | 73 | @property 74 | def r2_adj(self): 75 | """ 76 | Return the adjusted coefficient of determination R² of the regression. 77 | 78 | Returns 79 | ------- 80 | float 81 | The adjusted R² value. 82 | """ 83 | return self.sm_result.rsquared_adj 84 | 85 | @property 86 | def residuals(self): 87 | """ 88 | Return the residuals of the regression. 89 | 90 | Returns 91 | ------- 92 | array 93 | The array of residuals. 94 | """ 95 | return self.sm_result.resid 96 | 97 | 98 | class BatchRegressionResult: 99 | """ 100 | Encapsulate the results of batch regression. 101 | 102 | Parameters 103 | ---------- 104 | beta 105 | The regression coefficients. 106 | alpha: optional 107 | The intercept term, default is None. 108 | r2: optional 109 | The coefficient of determination R², default is None. 110 | r2_adj: optional 111 | The adjusted coefficient of determination R², default is None. 112 | residuals: optional 113 | The residuals, default is None. 114 | """ 115 | 116 | def __init__( 117 | self, 118 | beta, 119 | alpha=None, 120 | r2=None, 121 | r2_adj=None, 122 | residuals=None, 123 | ): 124 | # NOTE: public names will be displayed in __repr__ 125 | self.alpha = alpha 126 | self.beta = beta 127 | self.r2 = r2 128 | self.r2_adj = r2_adj 129 | self.residuals = residuals 130 | 131 | def __repr__(self): 132 | content = {a: getattr(self, a) for a in dir(self) if not a.startswith("_")} 133 | content = ",\n".join( 134 | [f" {k}:\n{textwrap.indent(repr(v), prefix=' ')}" for k, v in content.items() if v is not None] 135 | ) 136 | return f"{self.__class__.__name__}(\n{content}\n)" 137 | 138 | 139 | def _regression( 140 | x: pd.DataFrame | pd.Series, 141 | y: pd.Series, 142 | w: pd.Series = None, 143 | fit_intercept: bool = True, 144 | cov_type: str | None = None, 145 | cov_kwds: dict | None = None, 146 | ) -> sm.regression.linear_model.RegressionResults: 147 | """ 148 | Perform a linear regression using either OLS or WLS. 149 | 150 | Parameters 151 | ---------- 152 | x: pd.DataFrame | pd.Series 153 | The independent variable(s). 154 | y: pd.Series 155 | The dependent variable. 156 | w: pd.Series, optional 157 | The weights for WLS, default is None. 158 | fit_intercept: bool, optional 159 | Whether to fit an intercept term, default is True. 160 | cov_type: str | None, optional 161 | The covariance estimator, default is None. 162 | - If None: use the default homoskedastic standard errors. 163 | - If "HAC": Newey–West heteroskedasticity-and-autocorrelation robust SE. 164 | - Other options supported by statsmodels (e.g. "HC0", "HC1", …). 
165 |     cov_kwds: dict | None, optional
166 |         The keyword arguments for the covariance estimator, default is None.
167 |         For Newey–West, you’d typically pass `{"maxlags": L}` to control lag length.
168 |
169 |     Returns
170 |     -------
171 |     sm.regression.linear_model.RegressionResults
172 |         The regression results.
173 |     """
174 |
175 |     ## if x contains nan, fill nan with 0
176 |     ## TODO: fill nan with 0 is not a good idea, we should use the mean of the column to fill nan
177 |     x = np.nan_to_num(x, nan=0)
178 |     y = np.nan_to_num(y, nan=0)
179 |
180 |     if fit_intercept:
181 |         x = sm.add_constant(x)
182 |     if w is None:
183 |         model = sm.OLS(y, x)
184 |     else:
185 |         ## TODO: fill nan with 0 is not a good idea, we should use the mean of the column to fill nan
186 |         y = np.nan_to_num(y, nan=0)
187 |         model = sm.WLS(y, x, weights=w)
188 |
189 |     if cov_type is None:
190 |         return model.fit()
191 |     else:
192 |         return model.fit(cov_type=cov_type, cov_kwds=cov_kwds or {})
193 |
194 |
195 | def least_square(
196 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray,
197 |     y: pd.Series | np.ndarray,
198 |     w: pd.Series | np.ndarray | None = None,
199 |     fit_intercept: bool = True,
200 | ) -> RegressionResult:
201 |     """
202 |     A simple wrapper around sm.OLS or sm.WLS.
203 |
204 |     Parameters
205 |     ----------
206 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
207 |         The independent variable(s). If one-dimensional, the regression is considered univariate; otherwise it is
208 |         considered multivariate. This affects the format of the returned beta.
209 |     y: pd.Series | np.ndarray
210 |         The dependent variable.
211 |     w: pd.Series | np.ndarray | None, optional
212 |         The weights for WLS, default is None.
213 |     fit_intercept: bool, optional
214 |         Whether to fit an intercept term, default is True.
215 |
216 |     Returns
217 |     -------
218 |     RegressionResult
219 |         The regression result object.
220 |     """
221 |     if isinstance(x, (tuple, list)):
222 |         x = pd.concat(x, axis=1)
223 |
224 |     if isinstance(x, pd.Series):
225 |         x = x.to_frame()
226 |
227 |     if isinstance(x, np.ndarray):
228 |         if x.ndim == 1:
229 |             x = x.reshape(-1, 1)
230 |         if x.ndim != 2:
231 |             raise ValueError("x must be 1d or 2d array")
232 |
233 |     univariate = x.shape[1] == 1
234 |
235 |     result = _regression(x, y, w=w, fit_intercept=fit_intercept)
236 |     return RegressionResult(result, fit_intercept=fit_intercept, univariate=univariate)
237 |
238 |
239 | @delayed
240 | def calculate_window(x_wind, y_wind, w_wind, m1, m2, m3, m, fit_intercept, univariate, cov_type, cov_kwds):
241 |     alphas: list[float] = []
242 |     betas: list[float | np.ndarray | None] = []
243 |     for j in range(m):
244 |         x_j = x_wind[:, :, min(j, m1 - 1)].T
245 |         y_j = y_wind[:, min(j, m2 - 1)]
246 |         w_j = None if w_wind is None else w_wind[:, min(j, m3 - 1)]
247 |         # if any x is all nan, skip regression
248 |         if np.isnan(x_j).all(axis=0).any() or np.isnan(y_j).all():
249 |             # alpha is always a float; beta may be an array or a float, so use None to mark an empty result
250 |             alpha = np.nan
251 |             beta = None
252 |         else:
253 |             res = RegressionResult(
254 |                 # fit_intercept is always False, because we've padded X in __init__
255 |                 _regression(x_j, y_j, w_j, fit_intercept=False, cov_type=cov_type, cov_kwds=cov_kwds),
256 |                 fit_intercept=fit_intercept,
257 |                 univariate=univariate,
258 |             )
259 |             alpha = res.alpha
260 |             beta = res.beta
261 |         alphas.append(alpha)
262 |         betas.append(beta)
263 |
264 |     return alphas, betas
265 |
266 |
267 | class RollingRegressor:
268 |     """
269 |     Perform rolling regression.
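    For every end point t, the model is refit on the trailing `window` observations, producing time-varying alpha and beta panels.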
270 |
271 |     Parameters
272 |     ----------
273 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
274 |         The independent variable(s).
275 |     y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
276 |         The dependent variable.
277 |     w: optional
278 |         The weights for WLS, default is None.
279 |     mode: typing.Literal["single", "multi"], optional
280 |         The mode of regression, default is None.
281 |     fit_intercept: bool, optional
282 |         Whether to fit an intercept term, default is True.
283 |     """
284 |
285 |     def __init__(
286 |         self,
287 |         x,
288 |         y,
289 |         w=None,
290 |         *,
291 |         mode: typing.Literal["single", "multi"] = None,
292 |         fit_intercept: bool = True,
293 |     ):
294 |         # We generally don't check the alignment of the inputs. It's the user's obligation to make sure the inputs are
295 |         # compatible in terms of shape and aligned with each other.
296 |         self._keys = {}
297 |         self._index = {}
298 |         self._columns = {}
299 |         self.x = self._parse_data(x, "x")
300 |         self.y = self._parse_data(y, "y")
301 |         self.w = self._parse_data(w, "w", allow_none=True)
302 |
303 |         # "multi": x is 3d array
304 |         # "single": x is 2d array
305 |         if isinstance(self.x, np.ndarray):
306 |             if self.x.ndim == 2:
307 |                 self.inferred_mode = "single"
308 |                 self.x = self.x.reshape(1, *self.x.shape)
309 |             elif self.x.ndim == 3:
310 |                 self.inferred_mode = "multi"
311 |             else:
312 |                 raise ValueError("x must be 2d or 3d array")
313 |         else:
314 |             raise ValueError("parsed x should be array")
315 |
316 |         # now x is 3d array: key-index-columns
317 |         if fit_intercept:
318 |             self.x = np.concatenate([np.ones((1, *self.x.shape[1:])), self.x])
319 |
320 |         if mode is not None and mode != self.inferred_mode:
321 |             raise ValueError(f"inferred mode ({self.inferred_mode}) is not equal to the specified mode ({mode})")
322 |
323 |         self.keys = None if not self._keys else next(iter(self._keys.values()))
324 |         self.index = None if not self._index else next(iter(self._index.values()))
325 |
326 |         if not self._columns:
327 |             self.columns = None
328 |         else:
329 |             len_col = list(map(len, self._columns.values()))
330 |             max_len_loc = len_col.index(max(len_col))
331 |             self.columns = list(self._columns.values())[max_len_loc]
332 |
333 |         self.fit_intercept = fit_intercept
334 |
335 |     @property
336 |     def is_univariate(self):
337 |         """
338 |         Check if the regression is univariate.
339 |
340 |         Returns
341 |         -------
342 |         bool
343 |             True if univariate, False otherwise.
344 |         """
345 |         if self.inferred_mode == "single":
346 |             return True
347 |         else:
348 |             assert self.inferred_mode == "multi"
349 |             return False
350 |
351 |     def _parse_data(self, a, data_name: typing.Literal["x", "y", "w"], allow_none=False):
352 |         """
353 |         Parse the input data.
354 |
355 |         Parameters
356 |         ----------
357 |         a
358 |             The input data.
359 |         data_name: typing.Literal["x", "y", "w"]
360 |             The name of the data.
361 |         allow_none: bool, optional
362 |             Whether to allow None as input, default is False.
363 |
364 |         Returns
365 |         -------
366 |         np.ndarray
367 |             The parsed data.
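            Series are promoted to single-column frames, and DataFrame indexes/columns are recorded so that results can later be re-wrapped as pandas objects.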
368 | """ 369 | if a is None: 370 | if allow_none: 371 | return 372 | else: 373 | raise ValueError(f"input {data_name} cannot be None") 374 | if isinstance(a, pd.Series): 375 | a = a.to_frame() 376 | if isinstance(a, pd.DataFrame): 377 | self._index[data_name] = a.index 378 | self._columns[data_name] = a.columns 379 | return a.values 380 | elif isinstance(a, np.ndarray): 381 | if a.ndim == 1: 382 | a = a.reshape(-1, 1) 383 | if a.ndim not in (2, 3): 384 | raise ValueError(f"input {data_name} should be 2-d or 3-d if it's array") 385 | return a 386 | else: 387 | if data_name in ("x", "w"): 388 | if isinstance(a, dict): 389 | self._keys[data_name] = list(a.keys()) 390 | a = list(a.values()) 391 | 392 | if isinstance(a, (list, tuple)): 393 | if len(set([i.shape for i in a])) != 1: 394 | raise ValueError(f"input {data_name} should have same shape") 395 | if not all(i.ndim == 2 for i in a): 396 | raise ValueError(f"input contents of {data_name} should be 2-d, if it's list") 397 | if isinstance(a[0], pd.DataFrame): 398 | self._index[data_name] = a[0].index 399 | self._columns[data_name] = a[0].columns 400 | a = np.array(a) 401 | return a 402 | else: 403 | raise TypeError(f"input {data_name} should be array-like or list") 404 | 405 | else: 406 | raise ValueError(f"input {data_name}'s type not supported") 407 | 408 | @classmethod 409 | def _transpose_or_none(cls, _x): 410 | """ 411 | Transpose the array if it is not None. 412 | 413 | Parameters 414 | ---------- 415 | _x 416 | The input array. 417 | 418 | Returns 419 | ------- 420 | np.ndarray or None 421 | The transposed array or None. 422 | """ 423 | # the last 2 axes are always time x stocks 424 | if _x is not None: 425 | return np.swapaxes(_x, -1, -2) 426 | 427 | def fit( 428 | self, 429 | window: int | None = None, 430 | axis=0, 431 | cov_type: str | None = None, 432 | cov_kwds: dict | None = None, 433 | n_jobs: int = 4, 434 | verbose: int = 0, 435 | ): 436 | """ 437 | Fit the rolling regression model. 438 | 439 | Parameters 440 | ---------- 441 | window: int | None, optional 442 | The window size for rolling regression, default is None. If None, window = len(data) 443 | axis: int, optional 444 | The axis along which to perform the regression, default is 0. 445 | cov_type: str | None, optional 446 | The covariance estimator, default is None. 447 | - If None: use the default homoskedastic standard errors. 448 | - If "HAC": Newey–West heteroskedasticity-and-autocorrelation robust SE. 449 | - Other options supported by statsmodels (e.g. "HC0", "HC1", …). 450 | cov_kwds: dict | None, optional 451 | The keyword arguments for the covariance estimator, default is None. 452 | For Newey–West, you’d typically pass `{"maxlags": L}` to control lag length. 453 | n_jobs: int 454 | num of parallel workers, passed to Parallel 455 | verbose: int 456 | verbosity of progress, passed to Parallel 457 | 458 | Returns 459 | ------- 460 | BatchRegressionResult 461 | The batch regression result object. 
462 | """ 463 | x = self.x 464 | y = self.y 465 | w = self.w 466 | 467 | keys = self.keys 468 | index = self.index 469 | columns = self.columns 470 | 471 | fit_intercept = self.fit_intercept 472 | univariate = self.inferred_mode == "single" 473 | transpose = axis != 0 474 | 475 | if transpose: 476 | x = self._transpose_or_none(x) 477 | y = self._transpose_or_none(y) 478 | w = self._transpose_or_none(w) 479 | 480 | # generic shape compat 481 | k, n1, m1 = x.shape 482 | n2, m2 = y.shape 483 | if m1 != m2 and min(m1, m2) != 1: 484 | raise ValueError(f"incompatible x, y shapes: {x.shape} vs {y.shape}") 485 | if n1 != n2: 486 | raise ValueError(f"x, y should have same length") 487 | 488 | n = n1 489 | m = max(m1, m2) 490 | m3 = 1 491 | 492 | if w is not None: 493 | n3, m3 = w.shape 494 | if m3 > 1 and m3 != m: 495 | raise ValueError(f"incompatible x, y, w shapes: {x.shape} vs {y.shape} vs {w.shape}") 496 | if n3 != n: 497 | raise ValueError(f"x, w should have same length") 498 | 499 | # window not specified, use total length as window 500 | # in this case, result should also be pruned 501 | is_table = window is None 502 | if is_table: 503 | window = n 504 | 505 | alpha = None 506 | if fit_intercept: 507 | alpha = np.full((n, m), np.nan) 508 | beta = np.full((k - fit_intercept, n, m), np.nan) 509 | 510 | result_gen = Parallel(n_jobs=n_jobs, verbose=verbose, return_as="generator")( 511 | calculate_window( 512 | x_wind=x[:, i : i + window], 513 | y_wind=y[i : i + window], 514 | w_wind=None if w is None else w[i : i + window], 515 | m1=m1, 516 | m2=m2, 517 | m3=m3, 518 | m=m, 519 | fit_intercept=fit_intercept, 520 | univariate=univariate, 521 | cov_type=cov_type, 522 | cov_kwds=cov_kwds, 523 | ) 524 | for i in range(n - window + 1) 525 | ) 526 | for i, (alphas, betas) in enumerate(result_gen): 527 | alpha[i + window - 1] = alphas 528 | for j, _beta in enumerate(betas): 529 | if _beta is not None: 530 | beta[:, i + window - 1, j] = _beta 531 | 532 | # squeeze if table 533 | if is_table: 534 | # columns 535 | alpha = alpha[-1] 536 | # keys x columns 537 | beta = beta[:, -1] 538 | # maybe transpose back 539 | if transpose: 540 | beta = self._transpose_or_none(beta) 541 | # wrap dataframe if possible 542 | if is_table: 543 | alpha = pd.Series(alpha, index=index if transpose else columns, name="alpha") 544 | if transpose: 545 | # axis = 1 546 | beta = pd.DataFrame(beta, index=index, columns=keys) 547 | else: 548 | beta = pd.DataFrame(beta, index=keys, columns=columns) 549 | if self.is_univariate: 550 | beta = beta.squeeze(axis=axis) 551 | else: 552 | alpha = pd.DataFrame(alpha, index=index, columns=columns) 553 | if self.is_univariate: 554 | beta = pd.DataFrame(np.squeeze(beta, axis=0), index=index, columns=columns) 555 | else: 556 | beta = [pd.DataFrame(beta[i], index=index, columns=columns) for i in range(k - fit_intercept)] 557 | if keys is not None: 558 | for _key, _beta in zip(keys, beta): 559 | _beta.name = _key 560 | return BatchRegressionResult(beta, alpha=alpha) 561 | 562 | 563 | def rolling_regression(x, y, window, w=None, *, fit_intercept=True): 564 | """ 565 | Perform rolling regression. 566 | 567 | Parameters 568 | ---------- 569 | x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 570 | The independent variable(s). 571 | y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 572 | The dependent variable. 573 | window: int 574 | The window size for rolling regression. 575 | w: optional 576 | The weights for WLS, default is None. 
577 | fit_intercept: bool, optional 578 | Whether to fit an intercept term, default is True. 579 | 580 | Returns 581 | ------- 582 | BatchRegressionResult 583 | The batch regression result object. 584 | """ 585 | return RollingRegressor(x, y, w, fit_intercept=fit_intercept).fit(window) 586 | 587 | 588 | def table_regression(x, y, w=None, *, fit_intercept=True, axis=1): 589 | """ 590 | Perform table regression (apply regression column-wise or row-wise) 591 | 592 | Parameters 593 | ---------- 594 | x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 595 | The independent variable(s). 596 | y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 597 | The dependent variable. 598 | w: optional 599 | The weights for WLS, default is None. 600 | fit_intercept: bool, optional 601 | Whether to fit an intercept term, default is True. 602 | axis: int, optional 603 | The axis along which to perform the regression, default is 1. 604 | 605 | Returns 606 | ------- 607 | BatchRegressionResult 608 | The batch regression result object. 609 | """ 610 | return RollingRegressor(x, y, w, fit_intercept=fit_intercept).fit(None, axis=axis) 611 | -------------------------------------------------------------------------------- /firefin/core/plot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/plot/__init__.py -------------------------------------------------------------------------------- /firefin/core/plot/plots.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | from pathlib import Path 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | import statsmodels.api as sm 11 | from matplotlib.gridspec import GridSpec 12 | from numba import njit 13 | from scipy import stats 14 | 15 | from ...evaluation.eva_utils import PeriodType, IC, QuantileReturns 16 | 17 | __all__ = [ 18 | "plt_ic", 19 | "plt_cumulative_returns", 20 | "plt_quantile_cumulative_returns", 21 | "plt_quantile_cumulated_end_returns", 22 | ] 23 | 24 | 25 | sns.set_style("whitegrid") 26 | 27 | 28 | def _plt_cumsum_ic(summarized_data, ax, factor_name, data_name): 29 | y_mean = summarized_data.resample("YE", label="left").mean() 30 | for c, sr in y_mean.T.iterrows(): 31 | ax.scatter(sr.index, sr, marker="x") 32 | ax.set_title(f"{factor_name} Cumulative {data_name} & Yearly Mean") 33 | ax.axhline(0, linestyle="-", color="black", lw=1) 34 | 35 | axr = ax.twinx() 36 | axr.plot(summarized_data.cumsum()) 37 | axr.legend(summarized_data.columns, loc=2) 38 | axr.grid(linestyle=":") 39 | 40 | # make sure the left axis has visible 0-line 41 | b, t = ax.get_ylim() 42 | if b > 0: 43 | ax.set_ylim(0, t) 44 | elif t < 0: 45 | ax.set_ylim(b, 0) 46 | 47 | 48 | def _summarize_ic_data(data): 49 | origin_columns = data.columns 50 | 51 | _mean = data.mean() 52 | _std = data.std() 53 | _ir = data.mean() / data.std() 54 | 55 | summary_columns = [f"{c}, AVG={_mean[c]:.2%}, STD={_std[c]:.2%}, IR={_ir[c]:.2f}" for c in origin_columns] 56 | 57 | summarized_data = data.rename(columns=dict(zip(origin_columns, summary_columns))) 58 | 59 | return summarized_data 60 | 61 | 62 | def _plt_monthly_and_20ma_ic(data, axs, data_name, color_bounds): 63 | origin_columns 
= data.columns
64 |     markersize = 5
65 |
66 |     data_month = data.resample("ME").mean()
67 |     summarized_data = _summarize_ic_data(data)
68 |
69 |     for i_p, col in enumerate(origin_columns):
70 |         ax = axs[i_p]
71 |
72 |         data_col = summarized_data.iloc[:, i_p]
73 |         ax.plot(data_col.rolling(20, min_periods=1).mean())
74 |
75 |         month_p_data = data_month.iloc[:, i_p]
76 |
77 |         for color, bounds in color_bounds.items():
78 |             data_sel = month_p_data[(month_p_data >= bounds[0]) & (month_p_data <= bounds[1])]
79 |             data_sel.plot(color=color, linestyle="", marker="D", markersize=markersize, ax=ax)
80 |
81 |         ax.set(
82 |             xlabel="",
83 |             title=f"{col} {data_name}, Monthly Average and 20-day MA",
84 |         )
85 |
86 |     return summarized_data
87 |
88 |
89 | def plt_ic(ic_data: IC, factor_name="Factor", dist=True, plot_dir=None, show=True):
90 |     """
91 |     Plot an IC plot with Monthly IC, Cumulative IC and IC distribution.
92 |
93 |     Parameters
94 |     ----------
95 |     ic_data: IC
96 |     factor_name: str
97 |     plot_dir: None or Path
98 |     show: bool
99 |     dist: bool
100 |         if True, show distribution of IC and its QQ-plot.
101 |
102 |     """
103 |
104 |     ic_data = ic_data.dropna(how="all")
105 |     if not isinstance(ic_data.index, pd.DatetimeIndex):
106 |         ic_data.index = pd.DatetimeIndex(ic_data.index)
107 |     columns = ic_data.columns
108 |     n_cols = len(columns)
109 |
110 |     if dist:
111 |         fig_width = 20
112 |         grid_width = 4
113 |     else:
114 |         fig_width = 10
115 |         grid_width = 2
116 |
117 |     fig = plt.figure(figsize=(fig_width, 3.5 * n_cols))
118 |     grid = GridSpec(n_cols * 4, grid_width, figure=fig)
119 |
120 |     # line chart: one panel per period, 3 grid rows tall
121 |     ax0 = fig.add_subplot(grid[:3, :2])
122 |     axs = [ax0]
123 |     for i_p in range(1, n_cols):
124 |         axs.append(fig.add_subplot(grid[i_p * 3 : (i_p + 1) * 3, :2], sharex=ax0, sharey=ax0))
125 |     # cumulative chart: a single panel, n_periods grid rows tall
126 |     axs.append(fig.add_subplot(grid[n_cols * 3 :, :2]))
127 |
128 |     if dist:
129 |         # distribution plots: one pair per period, 4 grid rows tall
130 |         for i_p in range(n_cols):
131 |             for i in range(2):
132 |                 axs.append(fig.add_subplot(grid[i_p * 4 : (i_p + 1) * 4, 2 + i]))
133 |
134 |     # matplotlib/_color_data.py
135 |     # https://drafts.csswg.org/css-color-4/#named-colors
136 |     # all bounds are closed intervals; draw dark colors first, then the bright colors at the extremes and grey around 0
137 |     color_bounds = {
138 |         "grey": [-0.02, 0.02],
139 |         "darkblue": [-0.05, -0.02],
140 |         "darkred": [0.02, 0.05],
141 |         "blue": [-np.inf, -0.05],
142 |         "red": [0.05, np.inf],
143 |     }
144 |
145 |     summarized_data = _plt_monthly_and_20ma_ic(ic_data, axs, "IC", color_bounds=color_bounds)
146 |     _plt_cumsum_ic(summarized_data, axs[n_cols], factor_name, "IC")
147 |
148 |     if dist:
149 |         for i_p, p in enumerate(columns):
150 |             ic_data_p = ic_data.iloc[:, i_p].dropna()
151 |
152 |             ax1, ax2 = axs[n_cols + 1 + i_p * 2], axs[n_cols + 2 + i_p * 2]
153 |
154 |             sns.histplot(ic_data_p, kde=True, bins=int(np.ceil(np.log(ic_data_p.size) * 10)), stat="density", ax=ax1)
155 |             ax1.set(
156 |                 xlabel=f"{p}, Mean {ic_data_p.mean():.2f}, Skew {ic_data_p.skew():.2f}, Kurt {ic_data_p.kurt():.2f}"
157 |             )
158 |             sm.qqplot(ic_data_p, stats.norm, fit=True, line="45", ax=ax2)
159 |             ax2.set(ylabel="Observed Quantile", xlabel="Norm Distribution Quantile")
160 |
161 |     if plot_dir:
162 |         plt.savefig(Path(plot_dir) / f"{factor_name} IC plot.png", bbox_inches="tight")
163 |     if show:
164 |         plt.show()
165 |
166 |     summary_table = pd.DataFrame(
167 |         np.nan,
168 |         index=["mean", "std", "ir", "> 0", "< 0", "> 3%", "< -3%", "> 5%", "< -5%"],
169 |         columns=columns,
170 |     )
171 |
172 |     ic_mean = ic_data.mean()
173 |     ic_std = ic_data.std()
174 |     ir = ic_mean / ic_std
175 |
176 |
summary_table.loc["mean"] = ic_mean.values 177 | summary_table.loc["std"] = ic_std.values 178 | summary_table.loc["ir"] = ir.values 179 | summary_table.loc["> 0"] = ((ic_data > 0).sum() / np.isfinite(ic_data).sum()).values 180 | summary_table.loc["< 0"] = ((ic_data < 0).sum() / np.isfinite(ic_data).sum()).values 181 | summary_table.loc["> 3%"] = ((ic_data > 0.03).sum() / np.isfinite(ic_data).sum()).values 182 | summary_table.loc["< -3%"] = ((ic_data < -0.03).sum() / np.isfinite(ic_data).sum()).values 183 | summary_table.loc["> 5%"] = ((ic_data > 0.05).sum() / np.isfinite(ic_data).sum()).values 184 | summary_table.loc["< -5%"] = ((ic_data < -0.05).sum() / np.isfinite(ic_data).sum()).values 185 | print(summary_table) 186 | 187 | 188 | def _get_annual_and_end_returns(daily_cum_returns): 189 | daily_cum_returns = np.asarray(daily_cum_returns) 190 | n, m = daily_cum_returns.shape 191 | na_mask = np.isfinite(daily_cum_returns) 192 | end_returns = [] 193 | 194 | for j in range(m): 195 | for i in range(n): 196 | if na_mask[n - 1 - i, j]: 197 | end_returns.append(daily_cum_returns[n - 1 - i, j]) 198 | break 199 | end_returns = np.asarray(end_returns) 200 | annual_returns = np.float_power(np.add(end_returns, 1), 244 / na_mask.sum(axis=0)) - 1 201 | return annual_returns, end_returns 202 | 203 | 204 | def plt_cumulative_returns( 205 | *, 206 | daily_returns=None, 207 | daily_cum_returns=None, 208 | show_min_max=True, 209 | title="Cumulative Returns", 210 | ax=None, 211 | show=False, 212 | plot_dir=None, 213 | ): 214 | """ 215 | 216 | Parameters 217 | ---------- 218 | daily_returns: pd.DataFrame 219 | daily_cum_returns: pd.DataFrame 220 | show_min_max: bool 221 | title: str 222 | ax: matplotlib axis 223 | show: bool 224 | plot_dir: Path, default=None 225 | 226 | """ 227 | 228 | if daily_returns is None: 229 | if daily_cum_returns is None: 230 | raise ValueError(f"one of daily_returns or daily_cum_returns must be provided") 231 | else: 232 | daily_cum_returns = daily_cum_returns.dropna(how="all") 233 | daily_returns = daily_cum_returns.add(1).pct_change() 234 | daily_returns.iloc[0] = daily_cum_returns.iloc[0] 235 | 236 | else: 237 | if daily_cum_returns is not None: 238 | raise ValueError(f"exactly one of daily_returns or daily_cum_returns should be provided") 239 | else: 240 | daily_returns = daily_returns.dropna(how="all") 241 | daily_cum_returns = daily_returns.add(1).cumprod() - 1 242 | 243 | # set name for columns 244 | annual_returns, end_returns = _get_annual_and_end_returns(daily_cum_returns) 245 | daily_cum_returns.columns = [ 246 | f"{c}, ANN. {art:.2%}, TOT. {ert:.2%}" 247 | for c, art, ert in zip(daily_cum_returns.columns, annual_returns, end_returns) 248 | ] 249 | daily_returns.columns = daily_cum_returns.columns 250 | 251 | if ax is None: 252 | _, ax = plt.subplots() 253 | daily_cum_returns.plot(cmap=plt.cm.coolwarm, ax=ax) 254 | if show_min_max: 255 | max_group = daily_returns.columns[[0, -1]][np.argmax(annual_returns[[0, -1]])] 256 | min_group = daily_returns.columns[[0, -1]][np.argmin(annual_returns[[0, -1]])] 257 | min_max_diff = (daily_returns.loc[:, max_group] - daily_returns.loc[:, min_group] + 1).cumprod() - 1 258 | [annual_returns], [end_returns] = _get_annual_and_end_returns(min_max_diff.to_frame()) 259 | min_max_diff.name = f"Min Max, ANN. {annual_returns:.2%}, TOT. 
{end_returns:.2%}" 260 | min_max_diff.plot(lw=2, color="black", alpha=0.8, ax=ax) 261 | 262 | ax.set(xlabel="", ylabel="Cumulative Returns", title=title) 263 | ax.legend(loc=2, ncol=int(np.ceil(len(daily_returns.columns) / 25)), fontsize=8) 264 | ax.axhline(0.0, linestyle="-", color="black", lw=1) 265 | 266 | # if logy: 267 | # from matplotlib.ticker import FuncFormatter 268 | # 269 | # log_return_locator_cls = get_log_return_locator() 270 | # 271 | # fwd, ivt = lambda x: np.log1p(x), lambda x: np.exp(x) - 1 272 | # ax.set_yscale("function", functions=(fwd, ivt)) 273 | # ax.set_ylim([np.exp(np.log(1 + np.nanmin(daily_cum_returns)) * 1.1) - 1, None]) 274 | # ax.yaxis.set_major_locator(log_return_locator_cls(base=10, linthresh=1)) 275 | # ax.yaxis.set_major_formatter(FuncFormatter(log_return_formater)) 276 | 277 | if plot_dir: 278 | plt.savefig(plot_dir / f"{title}.png", bbox_inches="tight") 279 | if show: 280 | plt.show() 281 | 282 | 283 | def return_to_daily(data: pd.Series | pd.DataFrame, period: PeriodType): 284 | """Convert period returns to daily returns.""" 285 | if period == 1: 286 | return data.copy(deep=False) 287 | return ((data + 1) ** (1 / period)) - 1 288 | 289 | 290 | def compute_cum_returns(daily_ret: pd.Series | pd.DataFrame): 291 | return (1 + daily_ret).cumprod() - 1 292 | 293 | 294 | def _can_plot_recent(data: pd.Series | pd.DataFrame, years=3) -> tuple[bool, pd.Timestamp]: 295 | """check if longer than 3 years and return the -3 year loc if possible""" 296 | # index of this is datetime index 297 | index = data.index 298 | if (index[-1] - index[0]).days // 365 >= years: 299 | plot_recent = True 300 | loc = index[-1] - pd.Timedelta(days=365 * years) 301 | else: 302 | plot_recent = False 303 | loc = None 304 | return plot_recent, loc 305 | 306 | 307 | def plt_quantile_cumulative_returns(quantile_returns: QuantileReturns, factor_name="Factor", plot_dir=None, show=True): 308 | """ 309 | Plot the cumulative returns of each quantile. 
310 | 311 | Parameters 312 | ---------- 313 | quantile_returns: QuantileReturns 314 | factor_name: str 315 | plot_dir: Path, default None 316 | show: bool, default True 317 | 318 | """ 319 | cum_returns = { 320 | period: compute_cum_returns(return_to_daily(period_returns, period)) 321 | for period, period_returns in quantile_returns.items() 322 | } 323 | periods = sorted(cum_returns.keys()) 324 | 325 | plot_recent, loc = _can_plot_recent(next(iter(quantile_returns.values()))) 326 | 327 | fig, axs = plt.subplots(len(periods), 1 + plot_recent, figsize=(10 + (3 * plot_recent), 7 * len(periods))) 328 | for (period, period_cum_returns), ax in zip(cum_returns.items(), axs): 329 | if plot_recent: 330 | ax1, ax2 = ax 331 | else: 332 | ax1, ax2 = ax, NotImplemented 333 | 334 | plt_cumulative_returns( 335 | daily_cum_returns=period_cum_returns, 336 | ax=ax1, 337 | show_min_max=True, 338 | title=f"{factor_name} ({period} Fwd Period)", 339 | show=False, 340 | ) 341 | if plot_recent: 342 | recent_data = period_cum_returns.loc[loc:] + 1 343 | recent_data = recent_data.pct_change(fill_method=None).add(1).cumprod().sub(1) 344 | plt_cumulative_returns( 345 | daily_cum_returns=recent_data, 346 | ax=ax2, 347 | show_min_max=True, 348 | title=f"{factor_name} (Recent) ({period} Fwd Period)", 349 | show=False, 350 | ) 351 | if plot_dir: 352 | plt.savefig( 353 | Path(plot_dir) / f"{factor_name} Quantile Cum Returns.png", 354 | bbox_inches="tight", 355 | ) 356 | if show: 357 | plt.show() 358 | 359 | 360 | @njit 361 | def get_cum_end_returns(daily_rt): 362 | started = False 363 | cum_rt = 1 364 | total = 0 365 | n_cs_nans = 0 366 | for x in daily_rt: 367 | if np.isfinite(x): 368 | n_cs_nans = 0 369 | started = True 370 | cum_rt *= 1 + x 371 | total += 1 372 | else: 373 | n_cs_nans += 1 374 | if started: 375 | total += 1 376 | if not started: 377 | return np.nan 378 | total -= n_cs_nans 379 | return cum_rt ** (244 / total) - 1 380 | 381 | 382 | def get_cumulated_end_returns(daily_ret: pd.Series | pd.DataFrame, std=False): 383 | """ 384 | Get cumulated end returns of each quantile 385 | 386 | Parameters 387 | ---------- 388 | std: bool, default False 389 | If True, returns the standard deviation of the cumulated end returns 390 | 391 | """ 392 | 393 | returns_avg = daily_ret.apply(get_cum_end_returns, raw=True) 394 | 395 | if std: 396 | returns_std = daily_ret.std() * np.sqrt(244) 397 | return returns_avg, returns_std 398 | else: 399 | return returns_avg 400 | 401 | 402 | def _get_avg_and_std(quantile_returns: QuantileReturns): 403 | returns_avg = {} 404 | returns_std = {} 405 | for period, period_returns in quantile_returns.items(): 406 | returns_avg[period], returns_std[period] = get_cumulated_end_returns(period_returns, std=True) 407 | 408 | # quantile x periods 409 | return pd.DataFrame(returns_avg), pd.DataFrame(returns_std) 410 | 411 | 412 | def plt_quantile_cumulated_end_returns( 413 | quantile_returns: QuantileReturns, factor_name="Factor", plot_dir=None, show=True 414 | ): 415 | """ 416 | Plot the cumulated end returns of each quantile. 
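    Notes
    -----
    Annualisation follows `get_cum_end_returns` below: daily returns are compounded
    and scaled to a 244-trading-day year (the same constant used elsewhere in this
    module), ignoring leading and trailing NaNs. As an illustrative sanity check, a
    constant daily return annualises to (1 + r) ** 244 - 1 regardless of window length:

    >>> import numpy as np
    >>> np.isclose(get_cum_end_returns(np.full(100, 0.001)), 1.001 ** 244 - 1)
    True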
417 | 418 | Parameters 419 | ---------- 420 | quantile_returns: QuantileReturns 421 | factor_name: str 422 | plot_dir: Path, default None 423 | show: bool, default True 424 | 425 | """ 426 | returns_avg, returns_std = _get_avg_and_std(quantile_returns) 427 | 428 | plot_recent, loc = _can_plot_recent(next(iter(quantile_returns.values()))) 429 | 430 | w, h = (4 * len(returns_avg) * (1 + plot_recent) + 50) / 9, 16 431 | 432 | def _plot(avg, std, axavg, axstd, name): 433 | avg.plot(kind="bar", width=0.8, ax=axavg) 434 | axavg.set( 435 | xlabel="", 436 | ylabel="Return Mean (Ann.)", 437 | title=f"{name} Return Mean By Quantile", 438 | ) 439 | std.plot(kind="bar", width=0.8, ax=axstd) 440 | axstd.set( 441 | xlabel="", 442 | ylabel="Return Std (Ann.)", 443 | title=f"{name} Return Std By Quantile", 444 | ) 445 | 446 | fig, axs = plt.subplots(2, 1 + plot_recent, figsize=(w, h)) 447 | if plot_recent: 448 | (ax_avg1, ax_avg2), (ax_std1, ax_std2) = axs 449 | else: 450 | ax_avg1, ax_std1 = axs 451 | ax_avg2 = ax_std2 = None 452 | _plot(returns_avg, returns_std, ax_avg1, ax_std1, factor_name) 453 | 454 | if plot_recent: 455 | returns_avg_rct, returns_std_rct = _get_avg_and_std( 456 | QuantileReturns({k: v.loc[loc:] for k, v in quantile_returns.items()}) 457 | ) 458 | _plot(returns_avg_rct, returns_std_rct, ax_avg2, ax_std2, f"{factor_name} (Recent)") 459 | 460 | for ax in axs.flatten(): 461 | ax.yaxis.set_major_formatter(plt.FuncFormatter("{:.0%}".format)) 462 | 463 | if plot_dir: 464 | plt.savefig( 465 | Path(plot_dir) / f"{factor_name} Quantile End Returns.png", 466 | bbox_inches="tight", 467 | ) 468 | if show: 469 | plt.show() 470 | -------------------------------------------------------------------------------- /firefin/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | """ 5 | interface for fetching data 6 | 7 | """ 8 | from .gateway import fetch_data -------------------------------------------------------------------------------- /firefin/data/datainfo.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import os 5 | import pandas as pd 6 | from ..common.config import DATA_PATH 7 | 8 | def load_AStock_info() -> tuple[pd.DataFrame, pd.DataFrame]: 9 | try: 10 | columns = pd.read_feather(os.path.join(DATA_PATH, "columns.feather")) 11 | index = pd.read_feather(os.path.join(DATA_PATH, "index.feather")) 12 | except FileNotFoundError as e: 13 | raise FileNotFoundError(f"File not found: {e}, please download data first") 14 | return columns, index -------------------------------------------------------------------------------- /firefin/data/fake.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import typing 5 | from functools import partial 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from ..common.const import MIN_BARTIMES 11 | 12 | IndexType = typing.Literal["d", "day", "m", "min", "minute", "l1", "l2"] 13 | MockType = typing.Literal["rand", "norm", "price", 
"volume", "return", "arange"] 14 | 15 | 16 | def _get_l2_seconds(): 17 | morning = pd.timedelta_range("09:15:00", "11:30:00", freq="3 s") 18 | afternoon = pd.timedelta_range("13:00:00", "15:30:00", freq="3 s") 19 | return morning.union(afternoon) 20 | 21 | 22 | l2_seconds = _get_l2_seconds() 23 | 24 | 25 | def _index_maker(n, index_type: IndexType = "day"): 26 | if index_type in ("d", "day"): 27 | return pd.date_range("2010/1/1", periods=n, name="trade_date").strftime("%Y-%m-%d") 28 | elif index_type in ("m", "min", "minute"): 29 | n_days, n_minute = divmod(n, len(MIN_BARTIMES)) 30 | if n_minute > 0: 31 | n_days += 1 32 | day_part = _index_maker(n_days, index_type="day") 33 | total_index = pd.MultiIndex.from_product([day_part, MIN_BARTIMES], names=["trade_date", "bartime"]) 34 | if n_minute == 0: 35 | return total_index 36 | else: 37 | return total_index[: -(len(MIN_BARTIMES) - n_minute)] 38 | elif index_type in ("l1", "l2"): 39 | n_days, n_sec = divmod(n, 5702) 40 | if n_sec > 0: 41 | n_days += 1 42 | day_part = _index_maker(n_days, index_type="day") 43 | 44 | total_index = pd.concat( 45 | [pd.Series(0, index=pd.DatetimeIndex([dt]).repeat(5702) + l2_seconds) for dt in day_part] 46 | ).index 47 | if n_sec == 0: 48 | return total_index 49 | else: 50 | return total_index[: -(5702 - n_sec)] 51 | raise NotImplementedError(f"index_type {index_type} not implemented") 52 | 53 | 54 | def _nb_random(shape, mock): 55 | if mock == "rand": 56 | return np.random.random(shape) 57 | elif mock == "norm": 58 | return np.random.randn(*shape) 59 | elif mock == "return": 60 | return np.random.normal(0.0, 0.03, shape) 61 | elif mock == "price": 62 | rt = _nb_random(shape, mock="return") 63 | price = (rt + 1).cumprod().reshape((shape[0], -1)) 64 | price *= np.exp(np.random.normal(3.5, 1.06, price.shape[-1])) 65 | return price.reshape(shape) 66 | elif mock == "volume": 67 | return np.exp(np.random.normal(14.26, 1.29, shape)) 68 | elif mock == "arange": 69 | total = 1 70 | for s in shape: 71 | total *= s 72 | return np.arange(total, dtype=np.float64).reshape(shape) 73 | 74 | 75 | def _value_maker(shape, fill_value=np.nan, mock: MockType = "rand"): 76 | if fill_value is np.nan: 77 | if mock in MockType.__args__: 78 | return _nb_random(shape, mock) 79 | else: 80 | raise ValueError(f"mock {mock} not implemented") 81 | else: 82 | return np.full(shape, fill_value) 83 | 84 | 85 | def _generate_stock_code(i): 86 | c = f"{i:06}." 
87 | if not c.startswith(("0", "3", "6")): 88 | c = ("0", "3", "6")[int(c[0]) % 3] + c[1:] 89 | 90 | if c.startswith(("0", "3")): 91 | c += "SZ" 92 | else: 93 | c += "SH" 94 | return c 95 | 96 | 97 | def _columns_maker(n): 98 | return pd.Index(sorted(map(_generate_stock_code, range(n))), name="stock_code") 99 | 100 | 101 | def gen_df(*shape, fill_value=np.nan, index: IndexType = "day", mock: MockType = "rand", **joblib_kwargs): 102 | """quickly generate stock like DataFrames for test""" 103 | if not shape: 104 | shape = (10, 3) 105 | shape = tuple(np.ravel(shape)) 106 | 107 | index_maker = partial(_index_maker, index_type=index) 108 | value_maker = partial(_value_maker, fill_value=fill_value, mock=mock) 109 | 110 | if len(shape) == 1: 111 | container = pd.Series 112 | idx_col = {"index": index_maker(shape[0])} 113 | elif len(shape) == 2: 114 | container = pd.DataFrame 115 | idx_col = {"index": index_maker(shape[0]), "columns": _columns_maker(shape[1])} 116 | else: 117 | raise NotImplementedError(f"shape {shape} not implemented") 118 | 119 | out = container(value_maker(shape), **idx_col) 120 | return out 121 | -------------------------------------------------------------------------------- /firefin/data/file_reader.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import os 5 | import pandas as pd 6 | from ..common.config import DATA_PATH 7 | 8 | data_path = os.path.join(os.path.dirname(__file__), 'raw') 9 | 10 | # TODO: support other file types 11 | # TODO: support start and end date, only read the data in the range 12 | def read_feather(names): 13 | try: 14 | result = {n : pd.read_feather(f"{DATA_PATH}/{n}.feather") for n in names} 15 | except FileNotFoundError as e: 16 | raise FileNotFoundError(f"File not found: {e}, please download data first") 17 | return result 18 | 19 | 20 | def file_reader(info: dict[str, list[str]]) -> dict[str, pd.DataFrame]: 21 | # TODO: support other file types 22 | feather_reader_names = info['feather'] 23 | return read_feather(feather_reader_names) -------------------------------------------------------------------------------- /firefin/data/gateway.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from ..common.config import logger, DATA_MAPS 7 | from .datainfo import load_AStock_info 8 | from .fake import gen_df 9 | from .file_reader import file_reader 10 | 11 | def _get_clean_names(names) -> list: 12 | output = [] 13 | 14 | def _add_from_str(s): 15 | for n in s.split(","): 16 | n = n.replace(" ", "") 17 | if n and n not in output: 18 | output.append(n) 19 | 20 | for name in np.ravel(names): 21 | if isinstance(name, str): 22 | _add_from_str(name) 23 | else: 24 | # we assume it's iterable of strings 25 | for _name in name: 26 | _add_from_str(_name) 27 | return output 28 | 29 | 30 | def _get_clean_se(start=None, end=None, dates=None): 31 | """basic checks, no transformation for input ts""" 32 | if dates is None: 33 | if start is not None: 34 | assert np.ndim(start) == 0, f"start must be a scalar, got {start}" 35 | if end is not None: 36 | assert np.ndim(end) == 0, f"end must be a scalar, got {end}" 37 | 38 
| else: 39 | if start is not None or end is not None: 40 | raise ValueError("start and end cannot be used with dates") 41 | else: 42 | if isinstance(dates, slice): 43 | dates = [dates.start, dates.stop] 44 | elif pd.api.types.is_list_like(dates): 45 | pass 46 | else: 47 | # str, datetime-like 48 | dates = np.atleast_1d(dates) 49 | start, end = dates[0], dates[-1] 50 | 51 | return start, end 52 | 53 | 54 | def _parse_args(names, start_date, end_date, dates): 55 | """ 56 | parse names, start_date, end_date 57 | 58 | Notes 59 | ----- 60 | define a name string as either a single data name or several data names separated by commas. 61 | `names` can be: a single name string, an iterable of name strings, or an iterable containing name strings 62 | and trailing datetime-like 63 | if `names` has trailing datetime-like, `start_date`, `end_date` and `dates` should be None 64 | 65 | Examples 66 | -------- 67 | names can be: 68 | "close" 69 | "close, open" 70 | ["close", "open"] 71 | ["close, open"] 72 | trailing datetime-like can be: 73 | "2020/1/1" 74 | ["2020/1/1", "2020/1/2"] 75 | slice("2020/1/1", "2020/1/2") 76 | 77 | """ 78 | 79 | def is_datetime_like(obj): 80 | try: 81 | pd.to_datetime(obj) 82 | except Exception: 83 | return False 84 | else: 85 | return True 86 | 87 | if dates is None: 88 | # datetime is list-like or slice 89 | if isinstance(n1 := names[-1], slice): 90 | names = names[:-1] 91 | start_date, end_date = _get_clean_se(start_date, end_date, dates=n1) 92 | elif pd.api.types.is_list_like(n1) and is_datetime_like(t := np.ravel(n1)): 93 | names = names[:-1] 94 | start_date, end_date = _get_clean_se(start_date, end_date, dates=t) 95 | else: 96 | if is_datetime_like(n1): 97 | end_date = n1 98 | names = names[:-1] 99 | if len(names) >= 2 and is_datetime_like(n2 := names[-1]): 100 | start_date = n2 101 | names = names[:-1] 102 | else: 103 | start_date = end_date 104 | start_date, end_date = _get_clean_se(start_date, end_date) 105 | else: 106 | start_date, end_date = _get_clean_se(start_date, end_date, dates=dates) 107 | 108 | return _get_clean_names(names), start_date, end_date 109 | 110 | 111 | def check_if_valid(names: list[str]) -> dict[str, bool]: 112 | return {n: n in DATA_MAPS.keys() for n in names} 113 | 114 | 115 | def fetch_data( 116 | *args, 117 | names=None, 118 | start_date=None, 119 | end_date=None, 120 | dates=None, 121 | market_range="ALL", 122 | ) -> dict[str, pd.DataFrame]: 123 | if names is None: 124 | names = args 125 | elif args: 126 | raise ValueError("you may only use `names` or `*args` to specify the data to be queried") 127 | 128 | names, start_date, end_date = _parse_args(names, start_date, end_date, dates) 129 | 130 | results = {} 131 | if not names: 132 | return results 133 | 134 | valid = check_if_valid(names) 135 | 136 | for k, v in valid.items(): 137 | if not v: 138 | columns, index = load_AStock_info() 139 | logger.warning(f"{k} is not a valid data name, mock with random data") 140 | results[k] = gen_df((len(index), len(columns))) 141 | names.remove(k) 142 | 143 | if len(names) == 0: 144 | return results 145 | 146 | # only support file reader for now 147 | file_reader_names = dict() 148 | 149 | for name in names: 150 | try: 151 | l, r = DATA_MAPS[name].split("::") # noqa: E741 152 | except Exception as e: 153 | logger.error(f"cannot find data source for {name}, reason: {e}") 154 | continue 155 | 156 | if l == "file": 157 | if r not in file_reader_names: 158 | file_reader_names[r] = [name] 159 | else: 160 | file_reader_names[r].append(name) 161 | else: 162 | raise 
ValueError(f"{name} unsupported data source: {l}::{r}") 163 | 164 | # only support file reader for now 165 | results.update(file_reader(file_reader_names)) 166 | return results 167 | -------------------------------------------------------------------------------- /firefin/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/evaluation/__init__.py -------------------------------------------------------------------------------- /firefin/evaluation/academia/AcaEvaluatorModel.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import pandas as pd 3 | from ..eva_utils import compute_ic, ForwardReturns, QuantileReturns 4 | from ...core.algorithm.regression import least_square, rolling_regression, BatchRegressionResult 5 | from ...common.config import logger 6 | from .anomaly_test import AnomalyTest 7 | from .fama_macbeth import FamaMacBeth 8 | from .portfolio_sort import PortfolioSort 9 | 10 | class AcaEvaluatorModel: 11 | def __init__(self, 12 | factor: pd.DataFrame, 13 | forward_returns: ForwardReturns, 14 | return_adj: pd.DataFrame, 15 | n_jobs: int = 10, 16 | verbose: int = 0): 17 | """ 18 | Parameters: 19 | factor: pd.DataFrame 20 | Factor exposure data (Time × Stock) 21 | forward_returns: dict[str, pd.DataFrame] 22 | A dictionary where each key is a holding period, and the value is a DataFrame of future returns (Time × Stock) 23 | return_adj: pd.DataFrame 24 | DataFrame of adjusted returns (Time × Stock) 25 | n_jobs: int 26 | Number of jobs to run in parallel 27 | verbose: int 28 | Verbosity level 29 | """ 30 | 31 | self.factor = factor 32 | self.forward_returns = forward_returns 33 | self.return_adj = return_adj 34 | self.n_jobs = n_jobs 35 | self.verbose = verbose 36 | 37 | def run_single_sort(self, 38 | quantiles: int = 5, 39 | value_weighted: bool = True, 40 | return_stats: bool = False, 41 | market_cap: pd.DataFrame = None, 42 | get_quantile_sorts: bool = False): 43 | """ 44 | Perform single-factor portfolio sorting to compute returns for each quantile group, 45 | with optional return of statistics and quantile labels. 46 | 47 | Parameters: 48 | quantiles: int 49 | Number of quantile groups (e.g., 5 for quintile sorting) 50 | value_weighted: bool 51 | Whether to use value-weighted portfolios; False indicates equal-weighted portfolios 52 | return_stats: bool 53 | Whether to compute and return statistics (mean, t-stat, p-value, etc.) 
for the H-L portfolio 54 | market_cap: pd.DataFrame 55 | Market capitalization data, with the same dimensions as the factor; required if value_weighted is True 56 | get_quantile_sorts: bool 57 | Whether to return the quantile label assigned to each stock 58 | 59 | Returns: 60 | If return_stats is True: 61 | Tuple[QuantileReturns, dict] → (portfolio returns, dictionary of statistics) 62 | Otherwise: 63 | QuantileReturns 64 | """ 65 | 66 | if value_weighted and market_cap is None: 67 | raise ValueError("You must provide market_cap when value_weighted=True.") 68 | 69 | portfolio_returns = PortfolioSort.single_sort( 70 | factor=self.factor, 71 | forward_returns=self.forward_returns, 72 | market_cap=market_cap, 73 | quantiles=quantiles, 74 | value_weighted=value_weighted, 75 | get_quantile_sorts=get_quantile_sorts 76 | ) 77 | 78 | if return_stats: 79 | stats = PortfolioSort.get_statistics(portfolio_returns, quantiles) 80 | return portfolio_returns, stats 81 | 82 | return portfolio_returns 83 | 84 | def run_fama_macbeth(self, 85 | window: int = 252, 86 | return_stats: bool = False): 87 | """ 88 | Perform Fama-MacBeth two-stage cross-sectional regression estimation. 89 | 90 | Parameters: 91 | window: int 92 | Rolling window size for the first-stage regressions (default is 252, i.e., one year) 93 | return_stats: bool 94 | Whether to return t-statistics and significance test results 95 | 96 | Returns: 97 | If return_stats is True: 98 | Tuple[RegressionResult, dict] → (regression results, statistics) 99 | Otherwise: 100 | RegressionResult 101 | """ 102 | 103 | results = FamaMacBeth.run_regression(self.factor, self.return_adj, window=window, n_jobs=self.n_jobs, verbose=self.verbose) 104 | if return_stats: 105 | stats = FamaMacBeth.test_statistics(results) 106 | return results, stats 107 | return results 108 | 109 | def run_ic(self, method: str = "pearson") -> pd.DataFrame: 110 | """ 111 | Compute the Information Coefficient (IC) between the factor and future returns. 112 | 113 | Parameters: 114 | method: str 115 | Correlation method to use; options are: 'pearson', 'spearman', 'kendall' 116 | 117 | Returns: 118 | pd.DataFrame 119 | IC values for each period 120 | """ 121 | 122 | return compute_ic(self.factor, self.forward_returns, method=method) 123 | 124 | def run_regression(self, rolling: bool = False, window: int = 60, fit_intercept: bool = True) -> BatchRegressionResult | dict: 125 | """ 126 | Run either static or rolling regression of returns on factor exposures. 127 | 128 | Parameters 129 | ---------- 130 | rolling : bool, optional 131 | Whether to perform rolling regression, by default False. 132 | window : int, optional 133 | Rolling window size (only used if rolling=True), by default 60. 134 | fit_intercept : bool, optional 135 | Whether to include an intercept in the regression, by default True. 136 | 137 | Returns 138 | ------- 139 | BatchRegressionResult | dict 140 | Regression result object (static) or a dictionary of rolling results. 
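        Examples
        --------
        A minimal sketch on fake data (illustrative only; `gen_df` is the package's
        fake-data helper, everything else follows the signatures documented above):

        >>> from firefin.data.fake import gen_df
        >>> factor = gen_df(300, 20, mock="norm")
        >>> ret = gen_df(300, 20, mock="return")
        >>> model = AcaEvaluatorModel(factor, {1: ret}, ret)
        >>> rolling_res = model.run_regression(rolling=True, window=60)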
141 | """ 142 | if rolling: 143 | # Use rolling_regression function 144 | result = rolling_regression(x=self.factor, y=self.return_adj, window=window, fit_intercept=fit_intercept, n_jobs=self.n_jobs, verbose=self.verbose) 145 | else: 146 | # Time-by-time regression using least_square 147 | from collections import defaultdict 148 | results = defaultdict(list) 149 | for t in self.factor.index: 150 | x_t = self.factor.loc[t] 151 | y_t = self.return_adj.loc[t] 152 | if x_t.isnull().any() or y_t.isnull().any(): 153 | continue 154 | reg_result = least_square(x=x_t, y=y_t, fit_intercept=fit_intercept) 155 | results['alpha'].append(reg_result.alpha) 156 | results['beta'].append(reg_result.beta) 157 | results['r2'].append(reg_result.r2) 158 | results['r2_adj'].append(reg_result.r2_adj) 159 | results['residuals'].append(reg_result.residuals) 160 | result = BatchRegressionResult(alpha=results['alpha'], beta=results['beta'], r2=results['r2'], r2_adj=results['r2_adj'], residuals=results['residuals']) 161 | return result 162 | 163 | def run_anomaly_test(self, 164 | portfolio_returns: QuantileReturns, 165 | cov_type: typing.Optional[str] = None, 166 | cov_kwds: typing.Optional[dict] = None, 167 | return_stats: bool = False): 168 | """ 169 | Perform anomaly test by regressing portfolio returns on a factor model. 170 | 171 | Parameters: 172 | return_stats : bool 173 | Whether to return regression statistics summary. 174 | 175 | Returns: 176 | If return_stats is True: 177 | Tuple[AnomalyTest, pd.DataFrame] 178 | Else: 179 | AnomalyTest 180 | """ 181 | mkt_ret = pd.DataFrame(self.return_adj.mean(axis=1)) 182 | tester = AnomalyTest(portfolio_returns= portfolio_returns, factor_model=mkt_ret) 183 | 184 | if return_stats: 185 | summary = tester.fit(cov_type=cov_type, cov_kwds=cov_kwds).test_statistics() 186 | return summary 187 | return tester 188 | 189 | 190 | def run_all(self) -> dict: 191 | """ 192 | Run all available evaluation methods and return results in a dictionary. 193 | 194 | Returns 195 | ------- 196 | dict 197 | A dictionary containing the results of all evaluation methods. 
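            Keys, as populated below: 'single_sort_res', 'single_sort_stat',
            'fama_macbeth_res', 'fama_macbeth_stat', 'information_coefficient',
            'regression', and 'anomaly_stat' (the latter keyed by holding period).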
198 | """ 199 | results = {} 200 | # Single Sort 201 | logger.info("Running Single Sort") 202 | results['single_sort_res'], results['single_sort_stat'] = self.run_single_sort( 203 | quantiles=5, 204 | value_weighted=False, 205 | return_stats=True 206 | ) 207 | logger.info("Single Sort Completed") 208 | # Fama-MacBeth Regression 209 | logger.info("Running Fama-MacBeth Regression") 210 | results['fama_macbeth_res'], results['fama_macbeth_stat'] = self.run_fama_macbeth( 211 | window=252, 212 | return_stats=True 213 | ) 214 | logger.info("Fama-MacBeth Regression Completed") 215 | # IC 216 | logger.info("Running IC") 217 | results['information_coefficient'] = self.run_ic(method="pearson") 218 | logger.info("IC Completed") 219 | 220 | # Static Regression 221 | logger.info("Running Static Regression") 222 | results['regression'] = self.run_regression(rolling=False, fit_intercept=True) 223 | logger.info("Static Regression Completed") 224 | 225 | # Anomaly Test (accumulate one summary per holding period, keyed by period) 226 | logger.info("Running Anomaly Test") 227 | results['anomaly_stat'] = {} 228 | for k, v in results['single_sort_res'].items(): 229 | results['anomaly_stat'][k] = self.run_anomaly_test(portfolio_returns=pd.DataFrame(v.iloc[:, -1]), return_stats=True) 230 | logger.info("Anomaly Test Completed") 231 | 232 | return results -------------------------------------------------------------------------------- /firefin/evaluation/academia/AcaEvaluatorModelComparison.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from .portfolio_sort import PortfolioSort 3 | from .MSR_Test import MSRTest 4 | from ..eva_utils import ForwardReturns 5 | 6 | class AcaEvaluatorModelComparison: 7 | def __init__(self, factor1: pd.DataFrame, factor2: pd.DataFrame, forward_returns: ForwardReturns): 8 | """ 9 | Parameters: 10 | factor1 & factor2: pd.DataFrame 11 | Factor exposure data (Time × Stock) 12 | forward_returns: dict[str, pd.DataFrame] 13 | A dictionary where each key is a holding period, and the value is a DataFrame of future returns (Time × Stock) 14 | """ 15 | 16 | self.factor1 = factor1 17 | self.factor2 = factor2 18 | self.forward_returns = forward_returns 19 | 20 | def run_double_sort(self, 21 | quantiles: tuple = (5, 5), 22 | dependent: bool = False, 23 | value_weighted: bool = True, 24 | market_cap: pd.DataFrame = None, 25 | get_quantile_sorts: bool = False): 26 | """ 27 | Perform double-factor sorting by jointly grouping assets based on factor1 and factor2, and calculate returns. 
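        Portfolios are labelled "q1_q2" (primary group, then secondary group), and a
        hedge column 'HH-LL' (the spread between the top-top and bottom-bottom
        portfolios) is appended; see PortfolioSort.double_sort for details.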
28 | 29 | Parameters: 30 | quantiles: Tuple[int, int] 31 | Number of quantile groups for the primary and secondary factors (e.g., (5, 5)) 32 | dependent: bool 33 | Whether to use conditional (nested) sorting 34 | value_weighted: bool 35 | Whether to use value-weighted portfolios 36 | market_cap: pd.DataFrame 37 | Market capitalization data, same dimensions as the factors; required if value_weighted is True 38 | get_quantile_sorts: bool 39 | Whether to return portfolio labels (i.e., the group each stock belongs to) 40 | 41 | Returns: 42 | QuantileReturns or dict[str, pd.DataFrame] (if get_quantile_sorts is True) 43 | """ 44 | 45 | if value_weighted and market_cap is None: 46 | raise ValueError("You must provide market_cap when value_weighted=True.") 47 | 48 | return PortfolioSort.double_sort( 49 | factor1=self.factor1, 50 | factor2=self.factor2, 51 | forward_returns=self.forward_returns, 52 | market_cap=market_cap, 53 | quantiles=quantiles, 54 | dependent=dependent, 55 | value_weighted=value_weighted, 56 | get_quantile_sorts=get_quantile_sorts 57 | ) 58 | 59 | def run_msr_test(self, regularize=True): 60 | """ 61 | Compare the Maximum Sharpe Ratios of two factor models using a Z-test. 62 | Args: 63 | regularize (bool): If True, regularize the covariance matrices. 64 | Returns: 65 | dict: { 66 | 'msr_a': float, # MSR of model A 67 | 'msr_b': float, # MSR of model B 68 | 'test_stat': float, # Z-statistic 69 | 'p_value': float # two-sided p-value 70 | } 71 | """ 72 | return MSRTest.run_msr_comparison(model_a=self.factor1, model_b=self.factor2, regularize_covariance=regularize) 73 | 74 | def run_all(self, market_cap: pd.DataFrame = None) -> dict: 75 | """ 76 | Run all evaluation methods and return results as a dictionary. 77 | 78 | Parameters: 79 | market_cap: pd.DataFrame (required for the value-weighted double sort) 80 | 81 | Returns: 82 | dict: 83 | {'double_sort': result of double sort, 84 | 'msr_test': result of MSR test} 85 | """ 86 | results = {} 87 | 88 | try: 89 | results['double_sort'] = self.run_double_sort( 90 | quantiles=(5, 5), 91 | value_weighted=True, 92 | market_cap=market_cap, 93 | get_quantile_sorts=False 94 | ) 95 | except Exception as e: 96 | results['double_sort'] = f"Error: {e}" 97 | 98 | try: 99 | results['msr_test'] = self.run_msr_test( 100 | regularize=True 101 | ) 102 | except Exception as e: 103 | results['msr_test'] = f"Error: {e}" 104 | 105 | return results -------------------------------------------------------------------------------- /firefin/evaluation/academia/MSR_Test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import norm 4 | 5 | class MSRTest: 6 | """ 7 | A class to compute and statistically compare the Maximum Sharpe Ratios (MSRs) 8 | between two factor models using the asymptotic test from Barillas & Shanken (2018). 9 | """ 10 | 11 | @staticmethod 12 | def compute_max_sharpe_ratio(factor_returns: pd.DataFrame, regularize_covariance: bool = False) -> tuple: 13 | """ 14 | Compute the maximum Sharpe ratio for a factor model. 15 | 16 | Args: 17 | factor_returns (pd.DataFrame): T × K matrix of factor returns. 18 | regularize_covariance (bool): If True, regularize the covariance matrix. 
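        Note:
            With mean return vector mu and covariance Sigma, the tangency
            portfolio's maximum Sharpe ratio is sqrt(mu' Sigma^{-1} mu), which is
            exactly the quantity returned here.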
19 | Returns: 20 | tuple: 21 | - float: Maximum Sharpe Ratio 22 | - np.ndarray: Mean return vector (μ) 23 | - np.ndarray: Covariance matrix (Σ) 24 | """ 25 | mu = factor_returns.mean().values 26 | sigma = np.cov(factor_returns.T, ddof=1) 27 | 28 | # Regularizing the covariance matrix 29 | if regularize_covariance: 30 | epsilon = 1e-6 # small constant for regularization 31 | sigma += np.eye(sigma.shape[0]) * epsilon 32 | 33 | msr = np.sqrt(mu @ np.linalg.inv(sigma) @ mu) 34 | return msr, mu, sigma 35 | 36 | @staticmethod 37 | def asymptotic_variance_msr_squared(mu: np.ndarray, sigma: np.ndarray, T: int) -> float: 38 | """ 39 | Compute the asymptotic variance of the squared maximum Sharpe ratio. 40 | 41 | Args: 42 | mu (np.ndarray): Mean return vector. 43 | sigma (np.ndarray): Covariance matrix. 44 | T (int): Sample size. 45 | 46 | Returns: 47 | float: Asymptotic variance of MSR². 48 | """ 49 | inv_sigma = np.linalg.inv(sigma) 50 | term = 4 * (mu @ inv_sigma @ sigma @ inv_sigma @ mu) 51 | return term / T 52 | 53 | @staticmethod 54 | def run_msr_comparison(model_a: pd.DataFrame, model_b: pd.DataFrame, regularize_covariance: bool = False) -> dict: 55 | """ 56 | Compare the Maximum Sharpe Ratios of two factor models using a Z-test. 57 | 58 | Args: 59 | model_a (pd.DataFrame): T × K matrix of factor returns for model A. 60 | model_b (pd.DataFrame): T × K matrix of factor returns for model B. 61 | regularize_covariance (bool): If True, regularize the covariance matrix. 62 | Returns: 63 | dict: { 64 | 'msr_a': float, # MSR of model A 65 | 'msr_b': float, # MSR of model B 66 | 'test_stat': float, # Z-statistic 67 | 'p_value': float # two-sided p-value 68 | } 69 | """ 70 | T = model_a.shape[0] 71 | # Compute MSRs and their components 72 | msr_a, mu_a, sigma_a = MSRTest.compute_max_sharpe_ratio(model_a, regularize_covariance) 73 | msr_b, mu_b, sigma_b = MSRTest.compute_max_sharpe_ratio(model_b, regularize_covariance) 74 | 75 | # Compute variances of MSR² 76 | msr2_a = msr_a ** 2 77 | msr2_b = msr_b ** 2 78 | var_a = MSRTest.asymptotic_variance_msr_squared(mu_a, sigma_a, T) 79 | var_b = MSRTest.asymptotic_variance_msr_squared(mu_b, sigma_b, T) 80 | 81 | # Z-test for MSR² difference 82 | diff = msr2_a - msr2_b 83 | std_error = np.sqrt(var_a + var_b) 84 | z_stat = diff / std_error 85 | p_value = 2 * (1 - norm.cdf(np.abs(z_stat))) 86 | 87 | return { 88 | "msr_a": msr_a, 89 | "msr_b": msr_b, 90 | "test_stat": z_stat, 91 | "p_value": p_value 92 | } -------------------------------------------------------------------------------- /firefin/evaluation/academia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/evaluation/academia/__init__.py -------------------------------------------------------------------------------- /firefin/evaluation/academia/anomaly_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | from typing import List, Optional, Union 5 | 6 | from ...core.algorithm.regression import _regression, RegressionResult 7 | from ..eva_utils import QuantileReturns 8 | 9 | 10 | class AnomalyTest: 11 | """ 12 | Perform anomaly tests by regressing portfolio returns on a specified factor model 13 | and summarizing the resulting parameter estimates and test statistics. 
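    A minimal usage sketch (names are illustrative; `port_rets` holds portfolio
    return columns and `factor_rets` the factor return series):

    >>> tester = AnomalyTest(port_rets, factor_rets)
    >>> summary = tester.fit(cov_type="HAC", cov_kwds={"maxlags": 6}).test_statistics()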
14 | 15 | Attributes 16 | ---------- 17 | portfolio_returns : pd.DataFrame 18 | DataFrame of portfolio returns, with each column representing a distinct portfolio. 19 | factor_model : pd.DataFrame 20 | DataFrame containing factor return series as independent variables. 21 | _regression_results : dict[str, RegressionResult] 22 | Mapping from portfolio name to its fitted RegressionResult. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | portfolio_returns: QuantileReturns, 28 | factor_model: Union[pd.DataFrame, List[pd.Series]], 29 | ) -> None: 30 | """ 31 | Initialize the AnomalyTest object. 32 | 33 | Parameters 34 | ---------- 35 | portfolio_returns : QuantileReturns 36 | Data structure holding portfolio returns. Must be convertible to a DataFrame 37 | and have a .columns attribute. 38 | factor_model : DataFrame or list of Series 39 | Factor return series used as regressors. Can be a DataFrame or a list of Series. 40 | 41 | Raises 42 | ------ 43 | TypeError 44 | If inputs are not of the expected types. 45 | ValueError 46 | If factor_model is empty. 47 | """ 48 | # Convert portfolio_returns to DataFrame if needed 49 | if not hasattr(portfolio_returns, "columns"): 50 | raise TypeError("`portfolio_returns` must have a `columns` attribute.") 51 | self.portfolio_returns = ( 52 | portfolio_returns 53 | if isinstance(portfolio_returns, pd.DataFrame) 54 | else pd.DataFrame(portfolio_returns) 55 | ) 56 | 57 | # Build factor DataFrame 58 | if isinstance(factor_model, list): 59 | if not factor_model: 60 | raise ValueError("`factor_model` cannot be an empty list.") 61 | self.factor_model = pd.concat(factor_model, axis=1) 62 | elif isinstance(factor_model, pd.DataFrame): 63 | if factor_model.empty: 64 | raise ValueError("`factor_model` DataFrame cannot be empty.") 65 | self.factor_model = factor_model 66 | else: 67 | raise TypeError("`factor_model` must be a DataFrame or a list of Series.") 68 | 69 | self.factor_model.columns = [f'Factor_{i}' for i in self.factor_model.columns] 70 | self._regression_results: dict[str, RegressionResult] = {} 71 | 72 | def fit( 73 | self, 74 | cov_type: Optional[str] = None, 75 | cov_kwds: Optional[dict] = None, 76 | ) -> AnomalyTest: 77 | """ 78 | Fit time-series regressions of each portfolio return on the factor model. 79 | 80 | Parameters 81 | ---------- 82 | cov_type : str, optional 83 | Covariance estimator type (e.g., 'HAC' for Newey–West or 'HC1'). 84 | If None, uses the default homoskedastic standard errors. 85 | cov_kwds : dict, optional 86 | Keyword arguments for the covariance estimator (e.g., {'maxlags': L}). 87 | 88 | Returns 89 | ------- 90 | self : AnomalyTest 91 | The fitted AnomalyTest instance, with regression results stored internally. 
92 | """ 93 | # Align portfolio and factor data on the same dates 94 | df_all = pd.concat([self.portfolio_returns, self.factor_model], axis=1).dropna() 95 | y_df = df_all[self.portfolio_returns.columns] 96 | x_df = df_all[self.factor_model.columns] 97 | 98 | for port in y_df.columns: 99 | raw_res = _regression( 100 | x=x_df, 101 | y=y_df[port], 102 | w=None, 103 | fit_intercept=True, 104 | cov_type=cov_type, 105 | cov_kwds=cov_kwds, 106 | ) 107 | # Determine if regression is univariate 108 | univariate = (x_df.shape[1] == 1) 109 | wrapped = RegressionResult(raw_res, fit_intercept=True, univariate=univariate) 110 | self._regression_results[port] = wrapped 111 | return self 112 | 113 | def test_statistics(self) -> pd.DataFrame: 114 | """ 115 | Generate a comprehensive summary table of parameter estimates and test statistics. 116 | 117 | Returns 118 | ------- 119 | summary : pd.DataFrame 120 | A MultiIndex DataFrame with rows labeled by (portfolio, parameter) and 121 | columns ['coef', 'tvalue', 'stderr', 'pvalue']. 122 | """ 123 | records = [] 124 | for port, res in self._regression_results.items(): 125 | r = res.sm_result 126 | for param in r.params.index: 127 | records.append((port, param, r.params[param], r.tvalues[param], r.bse[param], r.pvalues[param])) 128 | summary = pd.DataFrame(records, columns=['portfolio', 'parameter', 'coef', 'tvalue', 'stderr', 'pvalue']) 129 | return summary.set_index(['portfolio', 'parameter']) 130 | 131 | def alpha(self, portfolio: str) -> float: 132 | """ 133 | Retrieve the intercept (alpha) from the regression of a specific portfolio. 134 | 135 | Parameters 136 | ---------- 137 | portfolio : str 138 | The name of the portfolio column. 139 | 140 | Returns 141 | ------- 142 | alpha : float 143 | The estimated intercept term. 144 | """ 145 | return self._regression_results[portfolio].alpha # type: ignore 146 | 147 | def betas(self, portfolio: str) -> pd.Series: 148 | """ 149 | Retrieve the factor loadings (betas) for a specific portfolio. 150 | 151 | Parameters 152 | ---------- 153 | portfolio : str 154 | The name of the portfolio column. 155 | 156 | Returns 157 | ------- 158 | betas : pd.Series 159 | Series of estimated factor coefficients, indexed by factor name. 160 | """ 161 | return self._regression_results[portfolio].beta # type: ignore 162 | -------------------------------------------------------------------------------- /firefin/evaluation/academia/fama_macbeth.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ...core.algorithm.newey_west_ttest_1samp import NeweyWestTTest 4 | from ...core.algorithm.regression import RollingRegressor, BatchRegressionResult 5 | 6 | 7 | class FamaMacBeth: 8 | 9 | @staticmethod 10 | def run_regression( 11 | factor: pd.DataFrame | pd.Series, return_adj: pd.DataFrame, window: int = 252, n_jobs=4, verbose: int = 0 12 | ) -> BatchRegressionResult: 13 | """ 14 | Run Fama-MacBeth regression. 15 | """ 16 | if isinstance(factor, pd.Series): 17 | # Convert series to DataFrame for consistency 18 | factor = pd.concat([factor] * return_adj.shape[1], axis=1) 19 | factor.columns = return_adj.columns 20 | elif isinstance(factor, pd.DataFrame): 21 | pass 22 | else: 23 | raise ValueError("Factor must be a pandas Series or DataFrame.") 24 | 25 | # Note: Calculate excess returns if necessary 26 | # return_adj = return_adj - risk_free_rate 27 | # excess returns differ across use cases, so we leave this to the user. 
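        # Illustrative only: with a risk-free series `rf` aligned to `return_adj`'s
        # index, excess returns would be computed as `return_adj.sub(rf, axis=0)`.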
28 | 29 | # First step: Time-series regressions 30 | r = RollingRegressor(factor, return_adj, None, fit_intercept=True).fit(window, n_jobs=n_jobs, verbose=verbose) 31 | 32 | # Second step: Cross-sectional regressions 33 | # At each date, the cross-section of returns is regressed on the first-stage betas 34 | r = RollingRegressor(r.beta, return_adj, None, fit_intercept=True).fit(window=None, axis=1, n_jobs=n_jobs, verbose=verbose) 35 | 36 | return r 37 | 38 | @staticmethod 39 | def test_statistics(results: BatchRegressionResult) -> pd.Series: 40 | # mean and std 41 | 42 | mean_beta = results.beta.mean() 43 | std_beta = results.beta.std() 44 | 45 | mean_alpha = results.alpha.mean() 46 | std_alpha = results.alpha.std() 47 | 48 | # t-statistics 49 | 50 | t_stat, p_value, se = NeweyWestTTest.newey_west_ttest_1samp(results.beta, popmean=0, lags=6, nan_policy="omit") 51 | 52 | return pd.Series( 53 | { 54 | "mean_beta": mean_beta, 55 | "std_beta": std_beta, 56 | "mean_alpha": mean_alpha, 57 | "std_alpha": std_alpha, 58 | "t_stat": t_stat, 59 | "p_value": p_value, 60 | "se": se, 61 | } 62 | ) 63 | -------------------------------------------------------------------------------- /firefin/evaluation/academia/portfolio_sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | Portfolio Sort Implementation for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing single and double portfolio sorts 5 | based on characteristics, market capitalization, and returns. The implementation 6 | focuses on clarity, documentation, and best practices for financial research. 7 | """ 8 | 9 | import typing 10 | import numpy as np 11 | import pandas as pd 12 | from ...core.algorithm.newey_west_ttest_1samp import NeweyWestTTest 13 | from ..eva_utils import factor_to_quantile, factor_to_quantile_dependent_double_sort 14 | from ..eva_utils import _compute_quantile_df, _compute_weighted_quantile_df 15 | from ..eva_utils import ForwardReturns, QuantileReturns 16 | 17 | StatisticResults = typing.NewType("StatisticResults", dict[str, pd.DataFrame]) 18 | 19 | class PortfolioSort: 20 | """ 21 | Class to perform single and double portfolio sorts based on characteristics. 22 | """ 23 | 24 | @staticmethod 25 | def single_sort( 26 | factor: pd.DataFrame, 27 | forward_returns: ForwardReturns, 28 | quantiles: int, 29 | value_weighted: bool = True, 30 | get_quantile_sorts: bool = False, 31 | market_cap: pd.DataFrame | None = None, 32 | ) -> typing.Union[QuantileReturns, pd.DataFrame]: 33 | """ 34 | Perform single portfolio sort based on characteristic and create long-short portfolio. 35 | 36 | Args: 37 | factor: (Time x Stock) DataFrame of characteristic exposures 38 | forward_returns: period : (Time x Stock) DataFrame of returns 39 | market_cap: (Time x Stock) DataFrame of market capitalizations 40 | quantiles: number of quantiles 41 | value_weighted: Use market cap weighting (True) or equal weighting (False) 42 | get_quantile_sorts: Return portfolio assignments 43 | Returns: 44 | Portfolio returns and statistical results 45 | """ 46 | # 1. DATA PREPARATION 47 | # assume factor, forward_return, market_cap are aligned DataFrames in our case 48 | # 2. QUANTILE CALCULATIONS 49 | quantile_sorts = factor_to_quantile(factor, quantiles) 50 | 51 | # Early exit if quantile assignments requested 52 | if get_quantile_sorts: 53 | return quantile_sorts 54 | 55 | # 3. 
RETURN CALCULATIONS 56 | # TODO: Add support for other weighting schemes 57 | # TODO: Add transaction costs 58 | if value_weighted: 59 | portfolio_returns = QuantileReturns({ 60 | period: _compute_weighted_quantile_df(quantile_sorts, period_returns, market_cap, quantiles=quantiles) 61 | for period, period_returns in forward_returns.items() 62 | }) 63 | else: 64 | # equal weighted 65 | portfolio_returns = QuantileReturns({ 66 | period: _compute_quantile_df(quantile_sorts, period_returns, quantiles=quantiles) 67 | for period, period_returns in forward_returns.items() 68 | }) 69 | 70 | # 4. HEDGE PORTFOLIO (High-Low) 71 | for period, _ in forward_returns.items(): 72 | portfolio_returns[period]["H-L"] = ( 73 | portfolio_returns[period][quantiles] - portfolio_returns[period][1] 74 | ) 75 | 76 | return portfolio_returns 77 | 78 | @staticmethod 79 | def double_sort( 80 | factor1: pd.DataFrame, 81 | factor2: pd.DataFrame, 82 | forward_returns: ForwardReturns, 83 | quantiles: typing.Tuple[int, int] = (5, 5), 84 | dependent: bool = False, 85 | value_weighted: bool = True, 86 | get_quantile_sorts: bool = False, 87 | market_cap: pd.DataFrame | None = None, 88 | ) -> typing.Union[QuantileReturns, pd.DataFrame]: 89 | """ 90 | Sort securities based on two characteristics. 91 | 92 | Args: 93 | factor1: (Time x Stock) DataFrame of characteristic exposures 94 | factor2: (Time x Stock) DataFrame of characteristic exposures 95 | forward_returns: period : (Time x Stock) DataFrame of returns 96 | market_cap: (Time x Stock) DataFrame of market capitalizations 97 | dependent: Whether to use dependent sorting (True) or independent sorting (False) 98 | quantiles: number of quantiles 99 | value_weighted: Use market cap weighting (True) or equal weighting (False) 100 | get_quantile_sorts: Return portfolio assignments 101 | Returns: 102 | Portfolio returns and statistical results 103 | """ 104 | # Ensure that factor1 and factor2 have the same index and columns 105 | assert factor1.index.equals(factor2.index) and factor1.columns.equals(factor2.columns), \ 106 | "factor1 and factor2 must have the same index and columns" 107 | 108 | # 1. DATA PREPARATION 109 | # assume factor1, factor2, forward_return, market_cap are aligned DataFrames in our case 110 | # 2. QUANTILE CALCULATIONS 111 | if dependent: 112 | # Dependent sorting (conditional double sorting) 113 | """ 114 | Note from Professor SHI: 115 | 116 | Suppose we first sort the stocks based on X1, dividing all stocks into L1 groups. Then, within each of 117 | these L1 groups, we further sort the stocks based on X2, dividing the stocks into L2 groups. Again, a 118 | total of L1 × L2 groups. 119 | 120 | The two sorting variables are NOT treated equally: the first sorting variable acts solely as a control 121 | variable, and the main interest is the relationship between the second sorting variable and asset 122 | returns. A factor should only be constructed based on the second sorting variable. 123 | 124 | Let's assume factor1 is the control variable and factor2 is the main variable of interest. 125 | We will first sort the stocks based on factor1, then within each group, we will sort the stocks based on factor2. 
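                Concretely, with quantiles=(5, 5) each date's stocks are first cut into
                5 groups on factor1 and then into 5 groups on factor2 within each of
                those, giving 25 portfolios labelled "q1_q2".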
126 | """ 127 | 128 | combined_sorts = factor_to_quantile_dependent_double_sort(factor1, factor2, quantiles) 129 | else: 130 | # independent sorting (unconditional double sorting) 131 | # Independent sorting can leave some stocks without a quantile label (None) 132 | 133 | quantile_sorts_factor1 = factor_to_quantile(factor1, quantiles[0]).astype(int) 134 | quantile_sorts_factor2 = factor_to_quantile(factor2, quantiles[1]).astype(int) 135 | # quantile_sorts to string and add them to q1_q2 format 136 | combined_sorts = quantile_sorts_factor1.astype(str) + "_" + quantile_sorts_factor2.astype(str) 137 | 138 | # Early exit if quantile assignments requested (no need to compute returns) 139 | if get_quantile_sorts: 140 | return combined_sorts 141 | 142 | # Initialize a dictionary to store the portfolio returns 143 | portfolio_returns = {} 144 | 145 | # 3. RETURN CALCULATIONS 146 | for period, period_returns in forward_returns.items(): 147 | 148 | # Calculate returns for each combined quantile 149 | if value_weighted: 150 | period_portfolio_returns = _compute_weighted_quantile_df( 151 | combined_sorts, period_returns, market_cap, reindex=False, quantiles=quantiles[0] * quantiles[1] 152 | ) 153 | else: 154 | # equal weighted 155 | period_portfolio_returns = _compute_quantile_df( 156 | combined_sorts, period_returns, reindex=False, quantiles=quantiles[0] * quantiles[1] 157 | ) 158 | # Store the results 159 | portfolio_returns[period] = period_portfolio_returns 160 | 161 | # 4. HEDGE PORTFOLIO (High-High vs Low-Low) 162 | for period, _ in forward_returns.items(): 163 | high_high = portfolio_returns[period].xs(f"{quantiles[0]}_{quantiles[1]}", axis=1) 164 | low_low = portfolio_returns[period].xs("1_1", axis=1) 165 | portfolio_returns[period]['HH-LL'] = high_high - low_low 166 | 167 | return QuantileReturns(portfolio_returns) 168 | 169 | @staticmethod 170 | def get_statistics(result: QuantileReturns, quantiles: int) -> StatisticResults: 171 | """ 172 | Compute statistical results for single portfolio sort. 173 | 174 | TODO: 175 | 1. Add more statistics 176 | 2. 
plot the results 177 | """ 178 | # T-Test for all periods 179 | # periods * (quantiles + H-L) 180 | t_stats = np.empty((len(result), quantiles + 1), dtype=float) 181 | p_values = np.empty((len(result), quantiles + 1), dtype=float) 182 | se_values = np.empty((len(result), quantiles + 1), dtype=float) 183 | mean_returns = np.empty((len(result), quantiles + 1), dtype=float) 184 | 185 | for n, (_, period_returns) in enumerate(result.items()): 186 | # T-Test for all periods 187 | t_stats[n], p_values[n], se_values[n] = np.apply_along_axis( 188 | NeweyWestTTest.newey_west_ttest_1samp, 189 | axis=0, 190 | arr=period_returns, 191 | popmean=0, 192 | lags=6, 193 | nan_policy='omit' 194 | ) 195 | # other statistics can be added here 196 | mean_returns[n] = np.nanmean(period_returns, axis=0) 197 | 198 | return StatisticResults({'t_stats': pd.DataFrame(t_stats, index=result.keys(), columns=period_returns.columns), 199 | 'p_values': pd.DataFrame(p_values, index=result.keys(), columns=period_returns.columns), 200 | 'se_values': pd.DataFrame(se_values, index=result.keys(), columns=period_returns.columns), 201 | 'mean_returns': pd.DataFrame(mean_returns, index=result.keys(), columns=period_returns.columns)}) -------------------------------------------------------------------------------- /firefin/evaluation/academia/winsorizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winsorization Implementation for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing winsorizations, including MAD, k-sigma, 5 | and winsorization at extreme percentiles. The implementation focuses on clarity, 6 | documentation, and best practices for financial research. 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from typing import Union, Tuple 12 | 13 | class Winsorizer: 14 | """ 15 | A class to perform winsorizations on cross-sectional characteristic matrices. 16 | Supports input as either pandas DataFrame or numpy array. 17 | """ 18 | 19 | @staticmethod 20 | def __to_dataframe(features: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame: 21 | """ 22 | Convert the input features to a pandas DataFrame if it is a numpy array. 23 | 24 | Args: 25 | features: (Time x Stock) DataFrame of features. 26 | 27 | Returns: 28 | pd.DataFrame: The input converted to a DataFrame. 29 | """ 30 | if isinstance(features, pd.DataFrame): 31 | return features.copy() 32 | elif isinstance(features, np.ndarray): 33 | return pd.DataFrame(features) 34 | else: 35 | raise TypeError("Input features must be a pandas DataFrame or a numpy array.") 36 | 37 | @staticmethod 38 | def __to_original_type(result: pd.DataFrame, original: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]: 39 | """ 40 | Convert the DataFrame result back to the type of the original input. 41 | 42 | Args: 43 | result (pd.DataFrame): The processed DataFrame. 44 | original (Union[pd.DataFrame, np.ndarray]): The original input features. 45 | 46 | Returns: 47 | Union[pd.DataFrame, np.ndarray]: The result in the same type as the original input. 48 | """ 49 | if isinstance(original, np.ndarray): 50 | return result.values 51 | return result 52 | 53 | @classmethod 54 | def MAD_winsorization( 55 | cls, 56 | features: Union[pd.DataFrame, np.ndarray], 57 | scaled: bool = False, 58 | k: int = 3 59 | ) -> Union[pd.DataFrame, np.ndarray]: 60 | """ 61 | Apply winsorization on features using the Median Absolute Deviation (MAD) method. 
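        Limits are median ± k * MAD, computed row-wise (per date). With scaled=True
        the MAD is multiplied by 1.4826, which makes it a consistent estimator of the
        standard deviation for normally distributed data.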
62 | 63 | Args: 64 | features: (Time x Stock) DataFrame of features. 65 | scaled (bool, optional): Whether to scale the MAD value (MAD * 1.4826). Default is False. 66 | k (int, optional): Scaling factor to determine limits. Default is 3. 67 | 68 | Returns: 69 | Union[pd.DataFrame, np.ndarray]: Winsorized features using MAD. 70 | """ 71 | original = features 72 | df = cls.__to_dataframe(features) 73 | 74 | # Calculate the median for each row 75 | median = df.median(axis=1) 76 | # Compute the absolute deviation from the median, then calculate the median of these deviations for each row 77 | mad = (df.sub(median, axis=0)).abs().median(axis=1) 78 | 79 | # Scale the MAD if required 80 | if scaled: 81 | mad *= 1.4826 82 | 83 | # Calculate the lower and upper limits for winsorization 84 | lower = median - k * mad 85 | upper = median + k * mad 86 | 87 | # Apply winsorization using the DataFrame.clip() method for each row 88 | result = df.clip(lower=lower, upper=upper, axis=0) 89 | return cls.__to_original_type(result, original) 90 | 91 | @classmethod 92 | def sigma_winsorization( 93 | cls, 94 | features: Union[pd.DataFrame, np.ndarray], 95 | k: int = 3 96 | ) -> Union[pd.DataFrame, np.ndarray]: 97 | """ 98 | Apply winsorization on features using the k-sigma rule. 99 | 100 | Args: 101 | features: (Time x Stock) DataFrame of features. 102 | k (int, optional): Scaling factor to determine limits. Default is 3. 103 | 104 | Returns: 105 | Union[pd.DataFrame, np.ndarray]: Winsorized features using the k-sigma rule. 106 | """ 107 | original = features 108 | df = cls.__to_dataframe(features) 109 | 110 | # Calculate the mean and standard deviation for each row 111 | mean = df.mean(axis=1) 112 | std = df.std(axis=1) 113 | 114 | # Calculate the lower and upper limits for winsorization 115 | lower = mean - k * std 116 | upper = mean + k * std 117 | 118 | # Apply winsorization using the DataFrame.clip() method for each row 119 | result = df.clip(lower=lower, upper=upper, axis=0) 120 | return cls.__to_original_type(result, original) 121 | 122 | @classmethod 123 | def percentile_winsorization( 124 | cls, 125 | features: Union[pd.DataFrame, np.ndarray], 126 | percentile: Tuple[float, float] = (0.01, 0.99), 127 | set_outlier_nan: bool = False 128 | ) -> Union[pd.DataFrame, np.ndarray]: 129 | """ 130 | Apply winsorization on features using the percentile rule. 131 | 132 | Args: 133 | features: (Time x Stock) DataFrame of features. 134 | percentile (Tuple[float, float], optional): The lower and upper percentiles to winsorize. 135 | Default is (0.01, 0.99). 136 | set_outlier_nan (bool, optional): Whether to set outliers to be NaN instead of clipping them. Default is False. 137 | 138 | Returns: 139 | Union[pd.DataFrame, np.ndarray]: Winsorized features using the percentile rule. 
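        Example (illustrative):

        >>> import numpy as np, pandas as pd
        >>> df = pd.DataFrame(np.random.randn(5, 100))
        >>> clipped = Winsorizer.percentile_winsorization(df, percentile=(0.05, 0.95))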
140 | """ 141 | original = features 142 | df = cls.__to_dataframe(features) 143 | 144 | # Calculate lower and upper bounds based on the given percentiles 145 | lower_bound = df.quantile(percentile[0], axis=1) 146 | upper_bound = df.quantile(percentile[1], axis=1) 147 | 148 | if set_outlier_nan: 149 | # set the outlier values to be NaN 150 | mask = df.lt(lower_bound, axis=0) | df.gt(upper_bound, axis=0) 151 | result = df.mask(mask) 152 | else: 153 | # Clip values to the specified bounds 154 | result = df.clip(lower=lower_bound, upper=upper_bound, axis=0) 155 | 156 | return cls.__to_original_type(result, original) -------------------------------------------------------------------------------- /firefin/evaluation/eva_utils.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | # TODO: Move some common algorithms to fire/core/algorithm/ 5 | 6 | import typing 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | __all__ = [ 12 | "compute_forward_returns", 13 | "compute_ic", 14 | "factor_to_quantile", 15 | "factor_to_quantile_dependent_double_sort", 16 | "compute_quantile_returns", 17 | "_compute_weighted_quantile_df", 18 | "_compute_quantile_df", 19 | ] 20 | 21 | PeriodType = typing.NewType("PeriodType", int) 22 | ForwardReturns = typing.NewType("ForwardReturns", dict[PeriodType, pd.DataFrame]) 23 | IC = typing.NewType("IC", pd.DataFrame) 24 | QuantileReturns = typing.NewType("QuantileReturns", dict[PeriodType, pd.DataFrame]) 25 | 26 | 27 | def compute_forward_returns(price: pd.DataFrame, periods: list[PeriodType]) -> ForwardReturns: 28 | forward_returns_dict = {} 29 | 30 | returns: pd.DataFrame = np.log(price).shift(-1) - np.log(price) 31 | 32 | for period in sorted(periods): 33 | if period == 1: 34 | forward_returns_dict[period] = np.exp(returns) - 1  # convert the 1-day log return to a simple return, consistent with period > 1 35 | continue 36 | 37 | log_period_returns = returns.rolling(period).sum().shift(1 - period) 38 | period_returns: pd.DataFrame = np.exp(log_period_returns) - 1 39 | forward_returns_dict[period] = period_returns 40 | return ForwardReturns(forward_returns_dict) 41 | 42 | 43 | def _compute_ic_df_df( 44 | a: pd.DataFrame, b: pd.DataFrame, method: typing.Literal["pearson", "kendall", "spearman"] 45 | ) -> pd.Series: 46 | return a.corrwith(b, axis=1, method=method) 47 | 48 | 49 | def compute_ic( 50 | factor: pd.DataFrame, forward_returns: ForwardReturns, method: typing.Literal["pearson", "kendall", "spearman"] 51 | ) -> IC: 52 | """ 53 | Compute IC (Information Coefficient) for the factor and forward returns, which is the correlation between the 54 | factor and the forward returns. 55 | 56 | Parameters 57 | ---------- 58 | factor: pd.DataFrame 59 | forward_returns: ForwardReturns 60 | method: str 61 | default "pearson" 62 | 63 | Returns 64 | ------- 65 | IC 66 | a dataframe of IC values for each period in columns. 67 | 68 | """ 69 | factor = factor[np.isfinite(factor)] 70 | return IC( 71 | pd.DataFrame( 72 | { 73 | period: _compute_ic_df_df(factor, period_returns, method=method) 74 | for period, period_returns in forward_returns.items() 75 | } 76 | ) 77 | ) 78 | 79 | 80 | def factor_to_quantile(factor: pd.DataFrame, quantiles: int = 5) -> pd.DataFrame: 81 | """ 82 | Convert factor to quantile row-wise. The result will always have quantile values ranging from `quantiles` down 83 | to 1 continuously (if only 1 group, it'll be `quantiles`). 
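    For example, with the default quantiles=5, a date (row) of ten strictly
    increasing factor values is labelled [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]:

    >>> import numpy as np, pandas as pd
    >>> factor_to_quantile(pd.DataFrame([np.arange(10.0)])).iloc[0].tolist()
    [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0]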
80 | def factor_to_quantile(factor: pd.DataFrame, quantiles: int = 5) -> pd.DataFrame:
81 |     """
82 |     Convert factor to quantile row-wise. The result will always have quantile values running from `quantiles` down
83 |     to 1 continuously (if there is only 1 group, it will be `quantiles`).
84 | 
85 |     Parameters
86 |     ----------
87 |     factor: pd.DataFrame
88 |     quantiles: int
89 |         default 5
90 | 
91 |     Returns
92 |     -------
93 |     pd.DataFrame
94 |         a dataframe of quantile values.
95 | 
96 |     """
97 |     quantile_values = np.arange(1, quantiles + 1)
98 | 
99 |     def _row_to_quantile(row):
100 |         finite = np.isfinite(row)
101 |         if finite.any():
102 |             tmp: pd.Series = pd.qcut(row[finite], quantiles, labels=False, duplicates="drop")
103 |             # rearrange values from `q` to 1
104 |             # this makes sure that the quantile values are generally continuous,
105 |             # and we always have a group of long portfolio of `q`
106 |             old_values = tmp.unique()
107 |             old_values.sort()
108 |             new_values = quantile_values[-len(old_values) :]
109 |             if not np.array_equal(old_values, new_values):
110 |                 tmp.replace(old_values, new_values, inplace=True)
111 |             row = row.copy()
112 |             row[finite] = tmp
113 |             return row
114 |         else:
115 |             return row
116 | 
117 |     return factor.apply(_row_to_quantile, axis=1)  # axis=1: quantiles are formed cross-sectionally within each date
118 | 
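A concrete check of the relabeling rule (ties make qcut drop bins; the surviving groups are renumbered so that the top group keeps the label `quantiles`):

import numpy as np
import pandas as pd
from firefin.evaluation.eva_utils import factor_to_quantile

f = pd.DataFrame([[np.nan, 1.0, 1.0, 1.0, 2.0, 3.0]])
print(factor_to_quantile(f, quantiles=5))
# Expected: [NaN, 3, 3, 3, 4, 5] -- only three bins survive the ties, and they
# are relabeled 3, 4, 5, so a long portfolio can always be built from group 5.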
119 | def factor_to_quantile_dependent_double_sort(primary_factor: pd.DataFrame, secondary_factor: pd.DataFrame, quantiles: typing.Tuple[int, int]):
120 |     """
121 |     Perform dependent double sorting on two factors.
122 | 
123 |     Parameters:
124 |     ------------
125 |     primary_factor : pd.DataFrame
126 |         The primary factor used for initial sorting.
127 |     secondary_factor : pd.DataFrame
128 |         The secondary factor used for sorting within each group defined by the primary factor.
129 |     quantiles : tuple of int
130 |         A tuple containing the number of quantiles for the primary and secondary factors respectively.
131 | 
132 |     Returns:
133 |     --------
134 |     quantile_sorts : pd.DataFrame
135 |         A DataFrame where each entry represents the quantile assignment for the secondary factor within the group defined by the primary factor.
136 | 
137 |     TODO: numba jit acceleration
138 |     """
139 |     quantile_values_p = np.arange(1, quantiles[0] + 1)
140 |     quantile_values_s = np.arange(1, quantiles[1] + 1)
141 | 
142 |     def _row_to_quantile(row_p, row_s):
143 |         finite_p, finite_s = np.isfinite(row_p), np.isfinite(row_s)
144 |         if not (finite_p.any() and finite_s.any()):
145 |             return pd.Series(index=row_p.index, dtype=object)
146 | 
147 |         # Sort by primary factor first
148 |         temp_p: pd.Series = pd.qcut(row_p[finite_p], quantiles[0], labels=False, duplicates="drop")
149 |         old_values = temp_p.unique()
150 |         old_values.sort()
151 |         new_values = quantile_values_p[-len(old_values) :]
152 |         if not np.array_equal(old_values, new_values):
153 |             temp_p.replace(old_values, new_values, inplace=True)
154 |         # align to the full cross-section so the boolean masks below match
155 |         temp_p = temp_p.reindex(row_p.index)
156 | 
157 |         # Sort by secondary factor within each primary quantile
158 |         # (float dtype so that unrankable stocks can be stored as NaN)
159 |         temp_s = pd.Series(np.nan, index=row_p.index)
160 |         for q in quantile_values_p:
161 |             mask = (temp_p == q) & finite_s
162 |             if mask.any():
163 |                 temp_s[mask] = pd.qcut(row_s[mask], quantiles[1], labels=False, duplicates="drop")
164 | 
165 |         old_values = temp_s.dropna().unique()
166 |         old_values.sort()
167 |         new_values = quantile_values_s[-len(old_values) :]
168 |         if not np.array_equal(old_values, new_values):
169 |             temp_s.replace(old_values, new_values, inplace=True)
170 | 
171 |         # build "primary_secondary" string labels; names missing either sort stay NaN
172 |         valid = temp_p.notna() & temp_s.notna()
173 |         labels = pd.Series(index=row_p.index, dtype=object)
174 |         labels[valid] = temp_p[valid].astype(int).astype(str) + "_" + temp_s[valid].astype(int).astype(str)
175 |         return labels
176 | 
177 |     result = pd.DataFrame(index=primary_factor.index, columns=primary_factor.columns)
178 |     # apply the function to each row of both factors
179 |     for (i, row_p), (_, row_s) in zip(primary_factor.iterrows(), secondary_factor.iterrows()):
180 |         result.loc[i] = _row_to_quantile(row_p, row_s)
181 | 
182 |     return result
183 | 
184 | def _compute_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, reindex=True, quantiles: int = 5):
185 |     # assume aligned
186 |     result = {}
187 |     for (dt, fr_row), (_, qt_row) in zip(fr.iterrows(), qt.iterrows()):
188 |         result[dt] = fr_row.groupby(qt_row).mean()
189 |     result = pd.DataFrame(result).T
190 |     if reindex:
191 |         return result.reindex(columns=np.arange(1, quantiles + 1), copy=False)
192 |     return result
193 | 
194 | def _compute_weighted_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, wt: pd.DataFrame, reindex=True, quantiles: int = 5):
195 |     # assume aligned
196 |     result = {}
197 |     for (dt, fr_row), (_, qt_row), (_, wt_row) in zip(fr.iterrows(), qt.iterrows(), wt.iterrows()):
198 |         _wt_row = wt_row.groupby(qt_row).transform(lambda x: x / x.sum())
199 |         result[dt] = (fr_row * _wt_row).groupby(qt_row).sum()
200 |     result = pd.DataFrame(result).T
201 |     if reindex:
202 |         return result.reindex(columns=np.arange(1, quantiles + 1), copy=False)
203 |     return result
204 | 
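The two helpers above differ only in the weighting scheme; a quick side-by-side sketch (the gen_df mocks are the same ones the tests use, with "volume" standing in for market cap as in portfolio_test.py):

from firefin.data.fake import gen_df
from firefin.evaluation.eva_utils import (
    factor_to_quantile, _compute_quantile_df, _compute_weighted_quantile_df,
)

factor = gen_df(10, 100, index="day", mock="rand")
fwd_1d = gen_df(10, 100, index="day", mock="return")
mkt_cap = gen_df(10, 100, index="day", mock="volume")

qt = factor_to_quantile(factor, quantiles=5)
eq = _compute_quantile_df(qt, fwd_1d, quantiles=5)                    # equal-weighted
vw = _compute_weighted_quantile_df(qt, fwd_1d, mkt_cap, quantiles=5)  # cap-weighted
print((vw - eq).mean())  # per-quantile effect of the weighting scheme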
205 | def compute_quantile_returns(
206 |     factor: pd.DataFrame, forward_returns: ForwardReturns, quantiles: int = 5
207 | ) -> QuantileReturns:
208 |     """
209 |     Compute quantile returns. Factor will be converted to quantiles using `factor_to_quantile`. Then, for each period
210 |     in forward_returns, the period returns will be grouped row-wise by quantiles and averaged.
211 | 
212 |     Parameters
213 |     ----------
214 |     factor: pd.DataFrame
215 |     forward_returns: ForwardReturns
216 |     quantiles: int
217 |         default 5
218 | 
219 |     Returns
220 |     -------
221 |     QuantileReturns
222 |         a dictionary of per-period returns for each quantile. Each entry is a DataFrame with dates as index and
223 |         quantiles as columns.
224 | 
225 |     """
226 |     factor_as_quantile = factor_to_quantile(factor, quantiles=quantiles)
227 |     return QuantileReturns(
228 |         {
229 |             period: _compute_quantile_df(factor_as_quantile, period_returns, quantiles=quantiles)
230 |             for period, period_returns in forward_returns.items()
231 |         }
232 |     )
233 | 
-------------------------------------------------------------------------------- /firefin/evaluation/industry/__init__.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 | 
4 | """
5 | factor evaluation
6 | 
7 | """
8 | from .evaluator import Evaluator
-------------------------------------------------------------------------------- /firefin/evaluation/industry/evaluator.py: --------------------------------------------------------------------------------
 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
 3 | 
 4 | import typing
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from ...core.plot import plots
 9 | from ..eva_utils import ForwardReturns, IC, QuantileReturns, compute_ic, compute_quantile_returns
10 | 
11 | __all__ = ["Evaluator"]
12 | 
13 | 
14 | def to_datetime_index(df: pd.DataFrame) -> pd.DataFrame:
15 |     out = df.copy(deep=False)
16 |     if not isinstance(out.index, pd.DatetimeIndex):
17 |         out.index = pd.to_datetime(out.index)
18 |     return out
19 | 
20 | 
21 | class Evaluator:
22 |     def __init__(self, factor: pd.DataFrame, forward_returns: ForwardReturns):
23 |         self.factor = factor
24 |         self.forward_returns = forward_returns
25 |         self._to_datetime_index()
26 |         self._reindex_forward_returns()
27 |         self._result = {}
28 | 
29 |     def _to_datetime_index(self):
30 |         self.factor = to_datetime_index(self.factor)
31 |         self.forward_returns = {k: to_datetime_index(v) for k, v in self.forward_returns.items()}
32 | 
33 |     def _reindex_forward_returns(self):
34 |         self.forward_returns = {k: v.reindex_like(self.factor, copy=False) for k, v in self.forward_returns.items()}
35 | 
36 |     def get_ic(self, method: typing.Literal["pearson", "kendall", "spearman"], plot=True) -> IC:
37 |         cache_key = ("ic", (method,))
38 |         if cache_key not in self._result:
39 |             self._result[cache_key] = compute_ic(self.factor, self.forward_returns, method)
40 |         ic = self._result[cache_key]
41 |         if plot:
42 |             plots.plt_ic(ic)
43 |         return ic
44 | 
45 |     def get_quantile_returns(self, quantiles: int = 5, plot=True) -> QuantileReturns:
46 |         cache_key = ("quantile_returns", (quantiles,))
47 |         if cache_key not in self._result:
48 |             self._result[cache_key] = compute_quantile_returns(self.factor, self.forward_returns, quantiles)
49 |         qt = self._result[cache_key]
50 |         if plot:
51 |             plots.plt_quantile_cumulative_returns(qt)
52 |             plots.plt_quantile_cumulated_end_returns(qt)
53 |         return qt
54 | 
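Evaluator memoizes each computation under a (name, parameters) key, so repeated calls (for example, re-plotting) reuse the cached result. A minimal sketch with fake data, in the spirit of tests/test.py:

from firefin.data.fake import gen_df
from firefin.evaluation.industry import Evaluator

factor = gen_df(100, 50, index="day", mock="rand")
forward_returns = {1: gen_df(100, 50, index="day", mock="return")}

ev = Evaluator(factor, forward_returns)
ic = ev.get_ic("pearson", plot=False)
assert ev.get_ic("pearson", plot=False) is ic  # second call is served from the cache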
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 120
 3 | 
 4 | [build-system]
 5 | requires = ["setuptools>=65.5.0", "wheel"]
 6 | build-backend = "setuptools.build_meta"
 7 | 
 8 | [project]
 9 | name = "firefin"
10 | version = "0.2.1"
11 | description = "The bundled open-source toolkit for the book Navigate through the Factor Zoo: The Science of Factor Investing."
12 | readme = "README.md"
13 | authors = [{ name = "Renjie Liao", email = "auderson@qq.com" }, { name = "Baochen Qiao", email = "baochenqiao@gmail.com" }]
14 | dependencies = [
15 |     "click >= 8.1.3",
16 |     "pandas >= 2.2.1",
17 |     "matplotlib >= 3.8.3",
18 |     "seaborn >= 0.13.2",
19 |     "statsmodels >= 0.14.1",
20 |     "scipy >= 1.12.0",
21 |     "numba >= 0.59.0",
22 |     "loguru >= 0.7.2",
23 |     "tqdm >= 4.66.4",
24 |     "joblib >= 1.4.2",
25 | ]
26 | 
27 | [project.scripts]
28 | firefin = "firefin.cli.command:cli"
-------------------------------------------------------------------------------- /tests/evaluation/Beta_test.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.stats import t as t_dist
 3 | import pandas as pd
 4 | # The null hypothesis here is β = 0 (i.e., the factor has no effect)
 5 | def beta_t_test(resid: pd.DataFrame, beta: pd.DataFrame, date, window_size, factors: pd.DataFrame) -> tuple:
 6 |     """
 7 |     Run a t-test (H0: β = 0) on each β from the multi-factor regression.
 8 | 
 9 |     :param resid: regression residual matrix, shape (T, N)
10 |     :param beta: β coefficient matrix, shape (N, K)
11 |     :param factors: factor return matrix, shape (T, K)
12 |     :return:
13 |         t_stats: matrix of t-statistics, shape (N, K)
14 |         p_values: matrix of two-sided p-values, shape (N, K)
15 |     """
16 |     beta = beta.loc[date].to_numpy()
17 |     if beta.ndim == 1:
18 |         beta = beta.reshape(1, -1).T  # N x 1
19 |     rows = []
20 |     for i in range(window_size):
21 |         resid_i = resid[i].loc[date].to_numpy()
22 |         rows.append(resid_i)
23 |     resid = np.stack(rows, axis=0).T  # resid is an N x T matrix
24 |     factors = factors.loc[:date].tail(window_size).to_numpy()
25 |     if factors.ndim == 1:
26 |         factors = factors.reshape(1, -1)
27 |     N, T = resid.shape
28 |     K = 1  # factors.shape[1]; factors is a 1 x T matrix
29 | 
30 |     df = T - K - 1
31 | 
32 |     cov_factor = np.cov(factors, rowvar=True, ddof=0).item()  # factor variance (MLE, divide by T)
33 |     sigma = []
34 |     for i in range(N):
35 |         residi = resid[i]
36 |         cov_epsilon_i = np.cov(residi, rowvar=True, ddof=K)
37 |         sigma.append(cov_epsilon_i.item())
38 | 
39 |     t_stats = np.zeros((N, K))
40 |     p_values = np.zeros((N, K))
41 | 
42 |     for j in range(N):
43 |         t_stats_j = beta[j] * np.sqrt(T * cov_factor) / np.sqrt(sigma[j])  # se(β_j) = σ_j / sqrt(T * var(f))
44 |         t_stats[j] = t_stats_j
45 |         p_value_j = 2 * (1 - t_dist.cdf(np.abs(t_stats_j), df))
46 |         p_values[j] = p_value_j
47 | 
48 | 
49 | 
50 |     return t_stats, p_values
51 | 
52 | 
53 | 
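For reference, the statistic computed above is the textbook OLS slope t-test for a single factor with homoskedastic errors:

t_j = \frac{\hat\beta_j}{\mathrm{se}(\hat\beta_j)}, \qquad
\mathrm{se}(\hat\beta_j) = \frac{\hat\sigma_{\varepsilon,j}}{\sqrt{T\,\widehat{\mathrm{Var}}(f)}}, \qquad
t_j \sim t_{T-K-1} \ \text{under } H_0\colon \beta_j = 0,

so the factor's sample variance enters through \sqrt{T\,\widehat{\mathrm{Var}}(f)} and the degrees of freedom are T - K - 1, matching `df` in the code.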
54 | def beta_test(resid: pd.DataFrame, beta: pd.DataFrame, date, window_size: int, factors: pd.DataFrame) -> str:
55 |     """
56 |     Run a t-test (H0: β = 0) on each β from the single-factor regression, and print the result as a LaTeX table.
57 | 
58 |     :param resid: regression residual matrix, shape (T, N); index is date, columns are asset names
59 |     :param beta: β coefficient matrix; index is date, columns are asset names
60 |     :param date: the date on which to run the test
61 |     :param window_size: window size
62 |     :param factors: factor return matrix; index is date, columns are factor names (single factor only)
63 |     :return: the LaTeX table as a string
64 |     """
65 |     # Extract the cross-sectional betas
66 |     beta_vals = beta.loc[date].to_numpy()
67 |     if beta_vals.ndim == 1:
68 |         beta_vals = beta_vals.reshape(-1)
69 | 
70 |     # Build the residual matrix (N x T)
71 |     rows = []
72 |     for i in range(window_size):
73 |         resid_i = resid[i].loc[date].to_numpy()
74 |         rows.append(resid_i)
75 |     resid_mat = np.stack(rows, axis=0).T  # N x T matrix
76 | 
77 |     # Extract the factor return series over the last window_size periods
78 |     fac = factors.loc[:date].tail(window_size).to_numpy().reshape(-1)
79 | 
80 |     N, T = resid_mat.shape
81 |     K = 1
82 |     df = T - K - 1
83 | 
84 |     # Factor variance (MLE, ddof=0)
85 |     cov_factor = np.var(fac, ddof=0)
86 |     # Residual variance of each asset (ddof=K)
87 |     sigma = [np.var(resid_mat[i], ddof=K) for i in range(N)]
88 | 
89 |     # Compute t-statistics and two-sided p-values
90 |     t_stats = beta_vals * np.sqrt(T * cov_factor) / np.sqrt(sigma)  # t = β * sqrt(T * var(f)) / σ_ε
91 |     p_values = 2 * (1 - t_dist.cdf(np.abs(t_stats), df))
92 | 
93 |     # Organize into a DataFrame
94 |     asset_names = resid.columns.tolist() if hasattr(resid, 'columns') else beta.columns.tolist()
95 |     df_result = pd.DataFrame({
96 |         't-statistic': t_stats,
97 |         'p-value': p_values
98 |     }, index=asset_names)
99 |     df_result.index.name = 'Asset'
100 | 
101 |     # Generate the LaTeX table
102 |     latex_table = df_result.to_latex(
103 |         float_format="%.4f",
104 |         caption="t-statistics and p-values for each asset",
105 |         label="tab:beta_t_test",
106 |         escape=False
107 |     )
108 | 
109 |     print(latex_table)
110 |     return latex_table
-------------------------------------------------------------------------------- /tests/evaluation/MSR_Test.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.MSR_Test import MSRTest
 2 | from firefin.data.fake import gen_df
 3 | 
 4 | factor1 = gen_df(253, 100, index="day", mock="rand")
 5 | factor2 = gen_df(253, 100, index="day", mock="rand")
 6 | 
 7 | def test_MSR_Test():
 8 |     result = MSRTest.run_msr_comparison(factor1, factor2, regularize_covariance=True)
 9 |     print("Model A MSR:", result["msr_a"])
10 |     print("Model B MSR:", result["msr_b"])
11 |     print("Z-statistic:", result["test_stat"])
12 |     print("P-value:", result["p_value"])
13 | 
14 | test_MSR_Test()
-------------------------------------------------------------------------------- /tests/evaluation/eva_utils.py: --------------------------------------------------------------------------------
 1 | from firefin.data.fake import gen_df
 2 | from firefin.evaluation.eva_utils import factor_to_quantile_dependent_double_sort
 3 | 
 4 | def test_factor_to_quantile_dependent_double_sort():
 5 |     factor1 = gen_df(10, 100, index="day", mock="rand")
 6 |     factor2 = gen_df(10, 100, index="day", mock="rand")
 7 | 
 8 |     double_sort = factor_to_quantile_dependent_double_sort(factor1, factor2, quantiles=(3, 5))
 9 |     print(double_sort.head())
10 | 
11 | 
12 | test_factor_to_quantile_dependent_double_sort()
-------------------------------------------------------------------------------- /tests/evaluation/fama_macbeth.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.fama_macbeth import FamaMacBeth
 2 | from firefin.data.fake import gen_df
 3 | from firefin.data import fetch_data
 4 | 
 5 | data = fetch_data(['open','close','volume','return_adj'])
 6 | 
 7 | def test_fama_macbeth_regression():
 8 |     r = FamaMacBeth.run_regression(data['close'], data['return_adj'], window=252, verbose=10, n_jobs=24)
 9 |     print(r)
10 |     stats = FamaMacBeth.test_statistics(r)
11 |     print(stats)
12 | 
13 | test_fama_macbeth_regression()
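The next file implements the Gibbons–Ross–Shanken (1989) test. For reference, the statistic it computes (with α̂ the N×1 intercepts, Σ̂ the N×N residual covariance, and μ̂, Ω̂ the factor mean vector and K×K covariance) is

\mathrm{GRS} = \frac{T}{N}\cdot\frac{T-N-K}{T-K-1}\cdot
\frac{\hat\alpha^{\top}\hat\Sigma^{-1}\hat\alpha}{1+\hat\mu^{\top}\hat\Omega^{-1}\hat\mu}
\;\sim\; F(N,\ T-N-K) \quad \text{under } H_0\colon \alpha = 0,

which is exactly the `dTestStat` expression and the F degrees of freedom used below.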
-------------------------------------------------------------------------------- /tests/evaluation/grs.py: --------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from scipy.stats import f
 4 | def grs_test(resid: pd.DataFrame, alpha: pd.DataFrame, date, window_size, factors: pd.DataFrame, label: str = "tab:grs", caption: str = "GRS test results") -> None:
 5 |     """ Perform the Gibbons, Ross and Shanken (1989) test.
 6 |     :param resid: Matrix of residuals from the OLS of size TxN.
 7 |     :param alpha: Vector of alphas from the OLS of size Nx1.
 8 |     :param factors: Matrix of factor returns size KxT.
 9 |     :return: Test statistic and p-value of the test statistic.
10 |     """
11 |     # Data preparation
12 |     alpha = alpha.loc[date].to_numpy()
13 |     if alpha.ndim == 1:
14 |         alpha = alpha.reshape(1, -1).T  # N x 1
15 |     rows = []
16 |     for i in range(window_size):
17 |         resid_i = resid[i].loc[date].to_numpy()
18 |         rows.append(resid_i)
19 |     resid = np.stack(rows, axis=0).T
20 |     factors = factors.loc[:date].tail(window_size).to_numpy()
21 |     if factors.ndim == 1:
22 |         factors = factors.reshape(1, -1)
23 | 
24 | 
25 |     # Determine the time series and assets
26 |     N, T = resid.shape
27 |     K = factors.shape[0]  # factors is a K x T matrix
28 |     if T - N - K <= 0:
29 |         # the F-test needs T > N + K degrees of freedom
30 |         raise ValueError("time period T should be greater than number of assets N plus factors K")
31 | 
32 | 
33 |     # Covariance of the residuals
34 |     Sigma = np.cov(resid, rowvar=True, ddof=K)  # N x N residual covariance matrix
35 | 
36 |     # Mean excess returns of the risk factors
37 |     factor_mean = np.mean(factors, axis=1, keepdims=True)  # K x 1 vector of factor means
38 | 
39 | 
40 |     # Covariance matrix of factors
41 |     omega = np.cov(factors, rowvar=True, ddof=0)  # K x K factor covariance (MLE, divide by T)
42 |     omega = np.atleast_2d(omega)
43 |     inv_omega = np.linalg.pinv(omega)
44 |     inv_Sigma = np.linalg.pinv(Sigma)
45 |     mult_ = (factor_mean.T @ inv_omega @ factor_mean).item()
46 |     mult = 1 / (1 + mult_)
47 |     inter = (alpha.T @ inv_Sigma @ alpha).item()
48 |     # GRS statistic
49 |     dTestStat = (T / N) * ((T - N - K) / (T - K - 1)) * inter * mult
50 |     # p-value of the F-test
51 |     df1 = N
52 |     df2 = T - N - K
53 |     pVal = 1 - f.cdf(dTestStat, df1, df2)
54 |     df = pd.DataFrame(
55 |         {"Value": [dTestStat, pVal]},
56 |         index=["GRS statistic", "p-value"],
57 |     )
58 | 
59 |     # Print the LaTeX code
60 |     print(df.to_latex(
61 |         float_format="%.4f",
62 |         caption=caption,
63 |         label=label,
64 |         header=False
65 |     ))
66 | 
-------------------------------------------------------------------------------- /tests/evaluation/portfolio_test.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.portfolio_sort import PortfolioSort
 2 | from firefin.data.fake import gen_df
 3 | 
 4 | 
 5 | 
 6 | factor1 = gen_df(10, 100, index="day", mock="rand")
 7 | factor2 = gen_df(10, 100, index="day", mock="rand")
 8 | 
 9 | # 1, 2, 3, 4, 5 periods
10 | forward_returns = { i: gen_df(10, 100, index="day", mock="return") for i in range(1, 6) }
11 | market_cap = gen_df(10, 100, index="day", mock="volume")
12 | 
13 | def test_single_sort():
14 |     # test single sort
15 |     single_sort_r = PortfolioSort.single_sort(factor1, forward_returns, market_cap, quantiles=5)
16 |     statistical_r = PortfolioSort.get_statistics(single_sort_r, quantiles=5)
17 | 
18 | def test_dual_sort():
19 |     # test dual sort
20 |     dual_sort_r = PortfolioSort.double_sort(factor1, factor2, forward_returns, market_cap, quantiles=(3,5))
21 |     # statistical_r_dual = PortfolioSort.get_statistics(dual_sort_r, quantiles=5)
22 |     print(dual_sort_r)
23 | 
24 | test_dual_sort()
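A single sort yields per-period quantile returns; a common next step is the top-minus-bottom spread. A sketch using the eva_utils pipeline (that PortfolioSort.single_sort returns the same {period: DataFrame} layout is an assumption here):

from firefin.data.fake import gen_df
from firefin.evaluation.eva_utils import compute_quantile_returns

factor = gen_df(253, 100, index="day", mock="rand")
forward_returns = {1: gen_df(253, 100, index="day", mock="return")}

qr = compute_quantile_returns(factor, forward_returns, quantiles=5)
spread = qr[1][5] - qr[1][1]  # long the top quintile, short the bottom
print(spread.mean(), spread.std())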
method="pearson") 15 | 16 | # compute factor 17 | factor = pv_corr(data["close"], data["volume"]) 18 | 19 | # compute forward returns 20 | fr = compute_forward_returns(data["open"].shift(-1), [1, 5, 10]) 21 | 22 | # compute industry evaluation 23 | mng = Evaluator(factor, fr) 24 | mng.get_ic("pearson") 25 | mng.get_quantile_returns(5) 26 | 27 | # compute academia evaluation 28 | from firefin.evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel 29 | 30 | mng = AcaEvaluatorModel(factor=factor, forward_returns=fr, return_adj=data["return_adj"], n_jobs=24, verbose=10) 31 | mng.run_all() -------------------------------------------------------------------------------- /tests/test_algo/test_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Created : 2025/3/27 11:36 3 | # @Author : Liao Renjie 4 | # @Email : liao.renjie@techfin.ai 5 | # @File : test_regression.py 6 | # @Software: PyCharm 7 | 8 | from unittest import TestCase 9 | 10 | import numpy as np 11 | import pandas as pd 12 | from numpy import array, nan 13 | 14 | from firefin.core.algorithm.regression import RollingRegressor, table_regression, rolling_regression 15 | 16 | 17 | class TestRegression(TestCase): 18 | 19 | def test_rolling_regressor_basic(self): 20 | 21 | x = np.arange(12).reshape(6, 2) 22 | y = np.arange(6) 23 | 24 | reg = RollingRegressor(x, y) 25 | self.assertEqual((2, 6, 2), reg.x.shape) 26 | self.assertEqual((6, 1), reg.y.shape) 27 | res = reg.fit(5) 28 | np.testing.assert_array_almost_equal( 29 | array( 30 | [ 31 | [nan, nan], 32 | [nan, nan], 33 | [nan, nan], 34 | [nan, nan], 35 | [0, -0.5], 36 | [0, -0.5], 37 | ] 38 | ), 39 | res.alpha, 40 | ) 41 | np.testing.assert_array_almost_equal( 42 | array( 43 | [ 44 | [nan, nan], 45 | [nan, nan], 46 | [nan, nan], 47 | [nan, nan], 48 | [0.5, 0.5], 49 | [0.5, 0.5], 50 | ] 51 | ), 52 | res.beta, 53 | ) 54 | 55 | def test_rolling_regression(self): 56 | x = pd.DataFrame(np.arange(12).reshape(6, 2), index=list("ABCDEF"), columns=list("ab"), dtype=float) 57 | y = x.copy() 58 | y.iloc[3:, 0] = x.iloc[3:, 0] * 2 + 1 59 | y.iloc[3:, 1] = x.iloc[3:, 1] * 4 + 1 60 | 61 | res0 = rolling_regression(x, y, 3) 62 | pd.testing.assert_frame_equal( 63 | res0.alpha, 64 | pd.DataFrame( 65 | array([[nan, nan], [nan, nan], [0, 0], [-4.66666667, -20.1666667], [-8.16666667, -32.3333333], [1, 1]]), 66 | index=list("ABCDEF"), 67 | columns=list("ab"), 68 | ), 69 | ) 70 | pd.testing.assert_frame_equal( 71 | res0.beta, 72 | pd.DataFrame( 73 | array([[nan, nan], [nan, nan], [1, 1], [2.75, 6.5], [3.25, 8], [2, 4]]), 74 | index=list("ABCDEF"), 75 | columns=list("ab"), 76 | ), 77 | ) 78 | 79 | res1 = rolling_regression([x, x], y, 3) 80 | pd.testing.assert_frame_equal(res0.alpha, res1.alpha) 81 | pd.testing.assert_frame_equal( 82 | pd.concat(res1.beta, axis=1), 83 | pd.DataFrame( 84 | array( 85 | [ 86 | [nan, nan, nan, nan], 87 | [nan, nan, nan, nan], 88 | [0.5, 0.5, 0.5, 0.5], 89 | [1.375, 3.25, 1.375, 3.25], 90 | [1.625, 4.0, 1.625, 4.0], 91 | [1.0, 2.0, 1.0, 2.0], 92 | ] 93 | ), 94 | index=list("ABCDEF"), 95 | columns=list("abab"), 96 | ), 97 | ) 98 | 99 | def test_table_regression(self): 100 | x = pd.DataFrame(np.arange(12).reshape(2, 6), index=list("ab"), columns=list("ABCDEF"), dtype=float) 101 | y = x.copy() 102 | y.iloc[0] = x.iloc[0] * 2 + 1 103 | y.iloc[1] = x.iloc[1] * 4 + 1 104 | w = x.copy() 105 | w.iloc[:] = 1 106 | 107 | for _w in [None, w]: 108 | with self.subTest(w=_w): 109 | res0 = table_regression(x, y, _w, 
110 |                 pd.testing.assert_frame_equal(res0.alpha + res0.beta * x, y)
111 | 
112 |                 res1 = table_regression(x, y, _w, axis=1)
113 |                 pd.testing.assert_frame_equal(x.mul(res1.beta, axis=0).add(res1.alpha, axis=0), y)
114 | 
115 |     def test_table_regression_weights(self):
116 |         x = pd.DataFrame(np.arange(12).reshape(2, 6), index=list("ab"), columns=list("ABCDEF"), dtype=float)
117 |         y = x.copy()
118 |         y.iloc[0] = x.iloc[0] * 2 + 1
119 |         y.iloc[1] = x.iloc[1] * 4 + 1
120 |         w = x.copy()
121 | 
122 |         # w does not affect the result here: y is an exact linear function of x, so the fit is exact for any weights
123 |         res0 = table_regression(x, y, w, axis=1)
124 |         pd.testing.assert_frame_equal(x.mul(res0.beta, axis=0).add(res0.alpha, axis=0), y)
-------------------------------------------------------------------------------- /tests/test_data.py: --------------------------------------------------------------------------------
 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
 3 | 
 4 | from firefin.data.gateway import fetch_data, DATA_MAPS
 5 | 
 6 | print(DATA_MAPS)
 7 | 
 8 | data = fetch_data(["open", "TradingValue", "test_no_data", "cn_bond_2y"])  # fetch cn_bond_2y too, since it is printed below
 9 | 
10 | print(data["open"])
11 | print(data["test_no_data"])
12 | print(data["cn_bond_2y"])
--------------------------------------------------------------------------------