├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── data_request.md
│   │   ├── feature_request.md
│   │   └── report-data-error.md
│   └── workflows
│       ├── pages.yml
│       └── pypi.yaml
├── .gitignore
├── LICENSE
├── NOTICE.txt
├── README.md
├── docs
│   ├── .gitignore
│   ├── 404.html
│   ├── Gemfile
│   ├── _config.yml
│   ├── _includes
│   │   └── head_custom.html
│   ├── docs
│   │   ├── Errata.md
│   │   ├── GRS检验的证明.md
│   │   ├── compute
│   │   │   └── compute.md
│   │   ├── configuration.md
│   │   ├── data
│   │   │   ├── A-share.md
│   │   │   └── data.md
│   │   ├── evaluation
│   │   │   ├── AcaEvaluatorModel.md
│   │   │   ├── AcaEvaluatorModelComparison.md
│   │   │   ├── aca_evaluator.md
│   │   │   └── evaluation.md
│   │   └── release-plan.md
│   └── index.md
├── file.py
├── firefin
│   ├── __init__.py
│   ├── cli
│   │   └── command.py
│   ├── common
│   │   ├── config.py
│   │   ├── config.yaml
│   │   └── const.py
│   ├── compute
│   │   ├── __init__.py
│   │   └── window.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── algorithm
│   │   │   ├── __init__.py
│   │   │   ├── _numba_funcs.py
│   │   │   ├── newey_west_ttest_1samp.py
│   │   │   └── regression.py
│   │   └── plot
│   │       ├── __init__.py
│   │       └── plots.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── datainfo.py
│   │   ├── fake.py
│   │   ├── file_reader.py
│   │   └── gateway.py
│   └── evaluation
│       ├── __init__.py
│       ├── academia
│       │   ├── AcaEvaluatorModel.py
│       │   ├── AcaEvaluatorModelComparison.py
│       │   ├── MSR_Test.py
│       │   ├── __init__.py
│       │   ├── anomaly_test.py
│       │   ├── fama_macbeth.py
│       │   ├── portfolio_sort.py
│       │   └── winsorizer.py
│       ├── eva_utils.py
│       └── industry
│           ├── __init__.py
│           └── evaluator.py
├── pyproject.toml
└── tests
    ├── evaluation
    │   ├── Beta_test.py
    │   ├── MSR_Test.py
    │   ├── aca_eva1_test.ipynb
    │   ├── eva_utils.py
    │   ├── fama_macbeth.py
    │   ├── grs.py
    │   └── portfolio_test.py
    ├── test.py
    ├── test_algo
    │   └── test_regression.py
    └── test_data.py

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: qiaobaochen

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/data_request.md:
--------------------------------------------------------------------------------
---
name: Data request
about: Request new data
title: NEW DATA REQUEST
labels: new data
assignees: qiaobaochen

---

**Which data do you want? Describe it in detail**
A clear and concise description of what the data is.

**Put the data source here**
URL here:

**Describe how to clean the data**
1. download
2. remove duplicates
3. fillna

**Additional context**
Add any other context or screenshots about the data request here.
23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/report-data-error.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Report data error 3 | about: Report Data error 4 | title: '' 5 | labels: data error 6 | assignees: qiaobaochen 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 5 | 6 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages 7 | name: Deploy Jekyll site to Pages 8 | 9 | on: 10 | push: 11 | branches: ["main"] 12 | paths: 13 | - "docs/**" 14 | 15 | # Allows you to run this workflow manually from the Actions tab 16 | workflow_dispatch: 17 | 18 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 19 | permissions: 20 | contents: read 21 | pages: write 22 | id-token: write 23 | 24 | # Allow one concurrent deployment 25 | concurrency: 26 | group: "pages" 27 | cancel-in-progress: true 28 | 29 | jobs: 30 | # Build job 31 | build: 32 | runs-on: ubuntu-latest 33 | defaults: 34 | run: 35 | working-directory: docs 36 | steps: 37 | - name: Checkout 38 | uses: actions/checkout@v4 39 | - name: Setup Ruby 40 | uses: ruby/setup-ruby@v1 41 | with: 42 | ruby-version: '3.3' # Not needed with a .ruby-version file 43 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 44 | cache-version: 0 # Increment this number if you need to re-download cached gems 45 | working-directory: '${{ github.workspace }}/docs' # Set the working-directory param to the docs folder 46 | - name: Setup Pages 47 | id: pages 48 | uses: actions/configure-pages@v5 49 | - name: Build with Jekyll 50 | # Outputs to the './_site' directory by default 51 | run: bundle exec jekyll build --baseurl "${{ steps.pages.outputs.base_path }}" 52 | env: 53 | JEKYLL_ENV: production 54 | - name: Upload artifact 55 | # Automatically uploads an artifact from the './_site' directory by default 56 | uses: actions/upload-pages-artifact@v3 57 | with: 58 | path: docs/_site # Set the path to the docs folder 59 | 60 | # Deployment job 61 | deploy: 62 | environment: 63 | name: github-pages 64 | url: ${{ steps.deployment.outputs.page_url }} 65 | runs-on: ubuntu-latest 66 | needs: build 67 | steps: 68 | - name: Deploy to 
GitHub Pages 69 | id: deployment 70 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | name: Build distribution 📦 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v4 12 | with: 13 | persist-credentials: false 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.x" 18 | - name: Install pypa/build 19 | run: python3 -m pip install build --user 20 | - name: Build a binary wheel and a source tarball 21 | run: python3 -m build 22 | - name: Store the distribution packages 23 | uses: actions/upload-artifact@v4 24 | with: 25 | name: python-package-distributions 26 | path: dist/ 27 | 28 | publish-to-pypi: 29 | name: >- 30 | Publish Python 🐍 distribution 📦 to PyPI 31 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 32 | needs: 33 | - build 34 | runs-on: ubuntu-latest 35 | environment: 36 | name: pypi 37 | url: https://pypi.org/p/firefin # Replace with your PyPI project name 38 | permissions: 39 | id-token: write # IMPORTANT: mandatory for trusted publishing 40 | 41 | steps: 42 | - name: Download all the dists 43 | uses: actions/download-artifact@v4 44 | with: 45 | name: python-package-distributions 46 | path: dist/ 47 | - name: Publish distribution 📦 to PyPI 48 | uses: pypa/gh-action-pypi-publish@release/v1 49 | 50 | github-release: 51 | name: >- 52 | Sign the Python 🐍 distribution 📦 with Sigstore 53 | and upload them to GitHub Release 54 | needs: 55 | - publish-to-pypi 56 | runs-on: ubuntu-latest 57 | 58 | permissions: 59 | contents: write # IMPORTANT: mandatory for making GitHub Releases 60 | id-token: write # IMPORTANT: mandatory for sigstore 61 | 62 | steps: 63 | - name: Download all the dists 64 | uses: actions/download-artifact@v4 65 | with: 66 | name: python-package-distributions 67 | path: dist/ 68 | - name: Sign the dists with Sigstore 69 | uses: sigstore/gh-action-sigstore-python@v3.0.0 70 | with: 71 | inputs: >- 72 | ./dist/*.tar.gz 73 | ./dist/*.whl 74 | - name: Create GitHub Release 75 | env: 76 | GITHUB_TOKEN: ${{ github.token }} 77 | run: >- 78 | gh release create 79 | "$GITHUB_REF_NAME" 80 | --repo "$GITHUB_REPOSITORY" 81 | --notes "" 82 | - name: Upload artifact signatures to GitHub Release 83 | env: 84 | GITHUB_TOKEN: ${{ github.token }} 85 | # Upload to GitHub Release using the `gh` CLI. 86 | # `dist/` contains the built packages, and the 87 | # sigstore-produced signatures and certificates. 
88 | run: >- 89 | gh release upload 90 | "$GITHUB_REF_NAME" dist/** 91 | --repo "$GITHUB_REPOSITORY" 92 | 93 | 94 | 95 | # publish-to-testpypi: 96 | publish-to-testpypi: 97 | name: Publish Python 🐍 distribution 📦 to TestPyPI 98 | needs: 99 | - build 100 | runs-on: ubuntu-latest 101 | 102 | environment: 103 | name: testpypi 104 | url: https://test.pypi.org/p/firefin 105 | 106 | permissions: 107 | id-token: write # IMPORTANT: mandatory for trusted publishing 108 | 109 | steps: 110 | - name: Download all the dists 111 | uses: actions/download-artifact@v4 112 | with: 113 | name: python-package-distributions 114 | path: dist/ 115 | - name: Publish distribution 📦 to TestPyPI 116 | uses: pypa/gh-action-pypi-publish@release/v1 117 | with: 118 | repository-url: https://test.pypi.org/legacy/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | develop.ipynb 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | .idea/ 162 | 163 | # feather datafiles 164 | *.feather 165 | # gzip datafiles 166 | *.gz 167 | 168 | # vscode 169 | .vscode/ 170 | 171 | # data packages 172 | AStockData.tar.gz -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Copyright Super Quantum Inc. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# F.I.R.E. Factor Investment Research Engine

This repo is the bundled open-source toolkit for the book _Navigating the Factor Zoo: The Science of Quantitative Investing_.

## Installation

```bash
# for the stable version
pip install firefin

# for test and nightly versions
pip install -i https://test.pypi.org/simple/ firefin

# install from source for local testing
## replace $ThisRepoURL with the actual repo url
git clone $ThisRepoURL
## install from source
pip install -e .
```

## Usage

Download the data
from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz).

Run the command below to download the data and place it in the correct path automatically.

```bash
# We have not released this repo yet, so you need to download the data manually! See the `firefin load` command below.
# Auto download data
firefin download
```

If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to check the data and put it in the correct path:

```bash
# replace path_to_data.tar.gz with the actual path
firefin load path_to_data.tar.gz
```

## Start to code

```python
import firefin

# get data
data = firefin.fetch_data(["open", "close", "volume"])
open_price = data["open"]


def pv_corr(close, volume):
    # price-volume correlation
    return close.rolling(20).corr(volume)


factor = pv_corr(data["close"], data["volume"])

# compute forward returns
fr = firefin.compute_forward_returns(open_price.shift(-1), [1, 5, 10])

# evaluate factor
mng = firefin.Evaluator(factor, fr)
mng.get_ic("pearson")
mng.get_quantile_returns(5)
```

## Features

1. Handy functions for fast factor computation
2. Various tools for factor evaluation

--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
_site
.sass-cache
.jekyll-cache
.jekyll-metadata
vendor
# gem
Gemfile.lock

--------------------------------------------------------------------------------
/docs/404.html:
--------------------------------------------------------------------------------
---
permalink: /404.html
layout: page
---
404

Page not found :(

The requested page could not be found.
26 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org/" 2 | # Hello! This is where you manage which Jekyll version is used to run. 3 | # When you want to use a different version, change it below, save the 4 | # file and run `bundle install`. Run Jekyll with `bundle exec`, like so: 5 | # 6 | # bundle exec jekyll serve 7 | # 8 | # This will help ensure the proper Jekyll version is running. 9 | # Happy Jekylling! 10 | # gem "jekyll", "~> 4.4.1" 11 | gem "github-pages", "~> 232", group: :jekyll_plugins 12 | # just the docs theme 13 | gem "just-the-docs" 14 | # If you want to use GitHub Pages, remove the "gem "jekyll"" above and 15 | # uncomment the line below. To upgrade, run `bundle update github-pages`. 16 | # gem "github-pages", group: :jekyll_plugins 17 | # If you have any plugins, put them here! 18 | group :jekyll_plugins do 19 | gem "jekyll-feed", "~> 0.12" 20 | end 21 | 22 | # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem 23 | # and associated library. 24 | platforms :mingw, :x64_mingw, :mswin, :jruby do 25 | gem "tzinfo", ">= 1", "< 3" 26 | gem "tzinfo-data" 27 | end 28 | 29 | # Performance-booster for watching directories on Windows 30 | gem "wdm", "~> 0.1", :platforms => [:mingw, :x64_mingw, :mswin] 31 | 32 | # Lock `http_parser.rb` gem to `v0.6.x` on JRuby builds since newer versions of the gem 33 | # do not have a Java counterpart. 34 | gem "http_parser.rb", "~> 0.6.0", :platforms => [:jruby] 35 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | title: F.I.R.E. 2 | description: Factor Investment Research Engine (The bundled opensource toolkit for book Navigating the Factor Zoo:The Science of Quantitative Investing.) 3 | theme: just-the-docs 4 | 5 | url: fire-institute.github.io 6 | 7 | aux_links: 8 | F.I.R.E. on GitHub: https://github.com/fire-institute/fire 9 | 10 | color_scheme: dark 11 | 12 | # Build settings 13 | markdown: kramdown 14 | compress_html: 15 | blanklines: true -------------------------------------------------------------------------------- /docs/_includes/head_custom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 20 | 21 | -------------------------------------------------------------------------------- /docs/docs/Errata.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Errata 3 | permalink: /errata/ 4 | nav_order: 2 5 | --- 6 | # Errata for *Navigating the Factor Zoo: The Science of Quantitative Investing* 7 | --- 8 | ## Overview 9 | 10 | This document captures all **verified printing and content errors** identified in *_Navigating the Factor Zoo: The Science of Quantitative Investing_*. It is maintained in the Fire Institute GitHub repository (https://github.com/fire-institute/fire) under `docs/docs/errata.md`. 11 | 12 | ### Structure of Entries 13 | Each erratum follows this format: 14 | 15 | | Field | Description | 16 | | ------------------ | ------------------------------------------------------------------ | 17 | | **Anchor** | Unique Markdown heading used as the link target. | 18 | | **Original** | Verbatim the incorrect text, caption, or equation. 
| **Correction** | The accurate replacement text, caption, or equation. |
| **Note** | (Optional) Additional context or explanation. |

---

### Submitting a New Error Report
To contribute:

1. **Search** existing GitHub issues to avoid duplicates.
2. **Open a new issue** with the title:
   ```
   [Errata] Page – brief description
   ```
3. **Fill in the template** in the issue body:
   ```markdown
   **Page**:
   **Section or Heading**:
   **Original**:
   **Correction**:
   ```
4. A maintainer will review, label it **confirmed**, and then add it here.

---

## Table of Contents

* [First Edition — Routledge (Hardcover & Paperback)](#first-edition-routledge-hardcover--paperback)
* [Page 66 – Equation 3.19](#page-66-equation-3-19)

---

## First Edition — Routledge (Hardcover & Paperback)

- **Publisher**: Routledge
- **Publication Date**: November 20, 2024 (Hardcover) / December 9, 2024 (Paperback)
- **Formats**: Hardcover (296 pp.) / Paperback (310 pp.)
- **ISBN-10**: 1032768436 (HC) / 103276841X (PB)
- **ISBN-13**: 978-1032768434 (HC) / 978-1032768410 (PB)

### Page 66 – Equation 3.19
**Original**

> In the limit of $n \rightarrow \infty$,$R V_{t}^{+} \rightarrow \text{ }_{t-1}^{t} \sigma_{s}^{2} ds+\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}>0 }^{2} $, $ R V_{t}^{-} \rightarrow \int_{t-1}^{t} \sum_{s}^{2} d s+\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}0 }^{2} $, and,
>
> $$S J_{t}=\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau}>0 }^{2} -\sum_{t-1 \leq \tau \leq t} J_{\tau J_{\tau} 0}^{2} $$

**Correction**

> In the limit of $n\to \infty$, $RV_t^+ \to \int _{t- 1}^t\sigma _s^2ds+ \sum_{t- 1\leq \tau \leq t}J_\tau^2 \mathbb{I} _{J_\tau > 0}$, $RV_t^- \to \int_{t- 1}^t \sigma_s^2 ds + \sum_{t-1\leq\tau\leq t}J_\tau^2\mathbb{I}_{J_\tau<0} $, and,
>
> $$SJ_t = \sum_{t- 1\leq \tau \leq t}J_\tau^2 \mathbb{I} _{J_\tau > 0}-\sum_{t-1\leq\tau\leq t}J_\tau^2\mathbb{I}_{J_\tau<0}.$$

**Note**

> Inserted the missing integral symbol, properly representing the continuous term as $\int_{t-1}^t\sigma_s^2\,ds$. Replaced the ambiguous jump-index notation with indicator functions $\mathbb{I}_{J_\tau>0}$ and $\mathbb{I}_{J_\tau<0}$ to clearly separate positive and negative jumps.
--------------------------------------------------------------------------------
/docs/docs/GRS检验的证明.md:
--------------------------------------------------------------------------------
# Proof of the GRS Test

Basics of the linear regression: quantities with a hat are estimators, quantities without a hat are the assumed latent true values; $y_{i}$, $x_{i}$ are observations.

$$
\vec{y}_{i}=\vec{\alpha}+\vec{\beta}x_{i}+\vec{\varepsilon},\qquad \vec{y}_{N\times 1},\quad \vec{\alpha}_{N\times 1},\quad \vec{\beta}_{N\times K},\quad \vec{\varepsilon}_{N\times 1}
$$

$$
\begin{array}{l}
\vec{y}=\vec{\alpha}+\vec{\beta}x_{i}\\
\vec{y}_{i}=\vec{\alpha}+\vec{\beta}x_{i}+\vec{\varepsilon}_{i}\\
\vec{y}_{i}=\hat{\vec{\alpha}}+\hat{\vec{\beta}}x_{i}+\hat{\vec{\varepsilon}}_{i}
\end{array}
$$

Least squares (all derivations below assume $\vec{\varepsilon}$ is normally distributed):

$$
\begin{array}{l}
\underset{\vec{\alpha}}{\arg\min}\displaystyle\sum_{i=1}^{T}\left\|\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right\|^{2}\\[4pt]
\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)
=\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)\\
=\dfrac{\partial}{\partial\vec{\alpha}}\displaystyle\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}^{\prime}\vec{y}_{i}-\vec{y}_{i}^{\prime}\vec{\alpha}-\vec{y}_{i}^{\prime}\vec{\beta}x_{i}-\vec{\alpha}^{\prime}\vec{y}_{i}+\vec{\alpha}^{\prime}\vec{\alpha}+\vec{\alpha}^{\prime}\vec{\beta}x_{i}-x_{i}^{\prime}\vec{\beta}^{\prime}\vec{y}_{i}+x_{i}^{\prime}\vec{\beta}^{\prime}\vec{\alpha}+\left(\vec{\beta}x_{i}\right)^{\prime}\left(\vec{\beta}x_{i}\right)\right)
\end{array}
$$

Setting the derivative with respect to $\vec{\alpha}$ equal to zero:

$$
\Rightarrow\sum_{i=1}^{T}-2\vec{y}_{i}+2\vec{\alpha}+2\vec{\beta}x_{i}=0
$$

Setting the derivative with respect to $\vec{\beta}$ equal to zero:

$$
\frac{\partial}{\partial\vec{\beta}}\sum_{i=1}^{T}\operatorname{Tr}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)^{\prime}\left(\vec{y}_{i}-\vec{\alpha}-\vec{\beta}x_{i}\right)=\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+2\vec{\alpha}x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
$$

Solving for $\vec{\alpha}$ and $\vec{\beta}$:

$$
\left\{\begin{array}{l}
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}+2\vec{\alpha}+2\vec{\beta}x_{i}=0\\
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+2\vec{\alpha}x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
\end{array}\right.
\Rightarrow
\left\{\begin{array}{l}
-2T\bar{y}+2T\vec{\alpha}+2T\vec{\beta}\bar{x}=0\ \Rightarrow\ \hat{\vec{\alpha}}=\bar{y}-\hat{\vec{\beta}}\bar{x}\\
\displaystyle\sum_{i=1}^{T}-2\vec{y}_{i}x_{i}^{\prime}+\left(2\bar{y}-2\vec{\beta}\bar{x}\right)x_{i}^{\prime}+2\vec{\beta}x_{i}x_{i}^{\prime}=0
\end{array}\right.
$$

Variance of $\hat{\vec{\beta}}$:

$$
\hat{\vec{\beta}}=\sum_{i=1}^{T}\left(\vec{y}_{i}-\bar{y}\right)x_{i}^{\prime}\cdot\left(\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right)^{-1}
$$

$$
\begin{array}{l}
\hat{\vec{\beta}}=\vec{\beta}+\displaystyle\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}
=\vec{\beta}+\displaystyle\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)\left(x_{i}-\bar{x}\right)^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}
\end{array}
$$

Here $x_{i}$ and $\bar{x}$ are constants (this holds in the univariate and the multivariate case alike). The assumption used is that the variance of $\varepsilon$ is unrelated to $x$, i.e. $\operatorname{Var}(\varepsilon\mid x)=\sigma^{2}$, $\Sigma=\sigma^{2}I$:

$$
\operatorname{Var}(\hat{\beta})=\sigma^{2}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}\right]^{-1}=\frac{1}{T}\Omega^{-1}\Sigma,\qquad \Omega=\frac{1}{T}\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)\left(x_{i}-\bar{x}\right)^{\prime}
$$

Variance of $\hat{\alpha}$:

$$
\operatorname{Var}(\hat{\alpha})=\frac{\Sigma}{T}+\bar{x}^{\prime}\operatorname{Var}(\hat{\beta})\bar{x}=\frac{1}{T}\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)\Sigma
$$

$$
\left[\ \hat{\vec{\alpha}}=\bar{y}-\hat{\vec{\beta}}\bar{x}=\vec{\alpha}+\vec{\beta}\bar{x}+\frac{1}{T}\sum_{i=1}^{T}\vec{\varepsilon}_{i}-\left(\vec{\beta}+\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}\right)\bar{x}=\vec{\alpha}+\frac{1}{T}\sum_{i=1}^{T}\vec{\varepsilon}_{i}-\sum_{i=1}^{T}\left(\vec{\varepsilon}_{i}-\bar{\varepsilon}\right)x_{i}^{\prime}\left[\sum_{i=1}^{T}\left(x_{i}-\bar{x}\right)x_{i}^{\prime}\right]^{-1}\bar{x}\ \right]
$$

$$
\Rightarrow\ \hat{\vec{\alpha}}\sim N_{N}\!\left(\vec{\alpha},\ \frac{1}{T}\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)\Sigma\right)
$$

Under the null hypothesis $\vec{\alpha}=0$:

$$
(T-K-1)\hat{\Sigma}\sim W_{N}(T-K-1,\ \Sigma)
$$

(because the estimator of $\Sigma$ is $\hat{\Sigma}=\frac{1}{T-K-1}\sum_{i=1}^{T}(\hat{\varepsilon}_{i}-\bar{\hat{\varepsilon}})(\hat{\varepsilon}_{i}-\bar{\hat{\varepsilon}})^{\prime}$, and the least-squares assumption $E(\hat{\varepsilon})=0$ gives $\bar{\hat{\varepsilon}}=0$), and

$$
\sqrt{T/\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)}\cdot\hat{\vec{\alpha}}\sim N_{N}\left(0,\ \Sigma\right)
$$

Constructing the Hotelling $T^{2}$ statistic. The statistic is built as follows: if $x\sim N_{p}(0,\Sigma)$ and $W\sim W_{p}(n,\Sigma)$, then

$$
\frac{n-p+1}{pn}\,n\,x^{\prime}W^{-1}x\sim F(p,\ n-p+1).
$$

Substitute $\sqrt{T/\left(1+\bar{x}^{\prime}\Omega^{-1}\bar{x}\right)}\cdot\hat{\vec{\alpha}}\sim N_{N}(0,\Sigma)$ for $x$ and $(T-K-1)\hat{\Sigma}\sim W_{N}(T-K-1,\Sigma)$ for $W$, i.e. $p=N$ and $n=T-K-1$:

$$
\Rightarrow\ \frac{T(T-K-N)}{N(T-K-1)}\cdot\hat{\alpha}^{\prime}\hat{\Sigma}^{-1}\hat{\alpha}\left(\frac{1}{1+\bar{x}^{\prime}\Omega^{-1}\bar{x}}\right)\sim F_{N,\ T-K-N}
$$

--------------------------------------------------------------------------------
/docs/docs/compute/compute.md:
--------------------------------------------------------------------------------
---
title: Construct Factor
nav_order: 5
permalink: /construct/
---
How to use data in your project
{: .fs-6 .fw-300 }
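This page is still a stub. As a starting point, the sketch below mirrors the factor-construction example from the project README — `fetch_data` plus ordinary pandas rolling operations. Treat it as a minimal sketch rather than the final content of this page.

```python
import firefin

# Fetch aligned (Time x Stock) DataFrames; field names follow the data docs.
data = firefin.fetch_data(["close", "volume"])

def pv_corr(close, volume, window=20):
    # rolling price-volume correlation, computed per security
    return close.rolling(window).corr(volume)

factor = pv_corr(data["close"], data["volume"])
```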
--------------------------------------------------------------------------------
/docs/docs/configuration.md:
--------------------------------------------------------------------------------
---
title: Installation
nav_order: 3
---

# Installation

```bash
# for the stable version
pip install firefin

# or install from source for local testing
## replace $ThisRepoURL with the actual repo url
git clone $ThisRepoURL
## install from source
pip install -e .
```

# Load Data

Download the data
from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz).

Run the command below to download the data and place it in the correct path automatically.

```bash
# We have not released this repo yet, so you need to download the data manually! See the `firefin load` command below.
# Auto download data
firefin download
```

If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to check the data and put it in the correct path:

```bash
# replace path_to_data.tar.gz with the actual path
firefin load path_to_data.tar.gz
```
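Once installed and loaded, a quick sanity check confirms the data layout — a minimal sketch using the `fetch_data` call documented in the README (field names as listed on the data pages):

```python
import firefin

# Load one field and confirm the (Time x Stock) layout described in the data docs.
close = firefin.fetch_data(["close"])["close"]
print(close.shape)        # (trading days, securities)
print(close.index[:3])    # trading dates
print(close.columns[:3])  # security identifiers
```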
--------------------------------------------------------------------------------
/docs/docs/data/A-share.md:
--------------------------------------------------------------------------------
---
title: A-Share Data
permalink: /data/a-share/
parent: Data Management
nav_order: 4.1
---

# A-Share Data

Fire provides comprehensive data for the Chinese A-share market, including historical prices, financial
statements, and other relevant information. This section outlines the available datasets and how to access them.

## Available Datasets

### Historical Prices

- **Daily Prices**: Contains daily open, high, low, close, money, vwap and volume data.
- **Daily Valuations**: Provides daily valuation metrics such as P/E ratio, P/B ratio, etc.

Data range: 2015-01-01 to 2025-05-01

Dataset categories:

1. quote: price and volume data
   (28 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| open | Open price (daily) |
| close | Close price (daily) |
| high | High price (daily) |
| low | Low price (daily) |
| volume | Trading volume (shares/units, daily) |
| money | Trading value (CNY, daily) |
| return_adj | Return (daily) |
| vwap | Volume-weighted average price (daily) |
| adj_factor | Adjustment factor |
| open_dr | Open price (daily) |
| high_dr | High price (daily) |
| low_dr | Low price (daily) |
| close_dr | Close price (daily) |
| volume_dr | Trading volume (shares/units, daily) |
| vwap_dr | Volume-weighted average price (daily) |
| FinanceValue | Margin financing balance (CNY) |
| FinanceBuyValue | Margin financing buy value (CNY) |
| FinanceRefundValue | Margin financing repayment value (CNY) |
| SecurityVolume | Securities lending balance (shares) |
| SecuritySellVolume | Securities lending sell volume (shares) |
| SecurityRefundVolume | Securities lending repayment volume (shares) |
| SecurityValue | Securities lending balance (CNY) |
| TradingValue | Total margin trading balance (CNY) |
| FinaInTotalRatio | Margin financing as a share of exchange-wide financing balance (%) |
| SecuInTotalRatio | Securities lending as a share of exchange-wide lending balance (%) |
| shares_holding | Shares held (shares) |
| hold_ratio | Holding ratio (%) |
| adjusted_hold_ratio | Adjusted holding ratio (%) |

2. valuation: valuation data
   (14 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| circulating_market_cap | Circulating market cap (CNY 100M, incl. HK-listed shares) |
| pcf_ratio | Price-to-cash-flow ratio (PCF, net cash flow TTM) |
| market_cap | Total market cap (CNY 100M, incl. HK-listed shares) |
| pe_ratio_lyr | Static P/E ratio (last yearly report) |
| circulating_cap | Circulating share capital (10K shares, incl. HK-listed shares) |
| capitalization | Total share capital (10K shares, incl. HK-listed shares) |
| pb_ratio | Price-to-book ratio (PB) |
| pe_ratio | Price-to-earnings ratio (PE, TTM) |
| ps_ratio | Price-to-sales ratio (PS, TTM) |
| turnover_ratio | Turnover ratio (%) |
| circulating_market_cap_ashare | A-share circulating market cap (CNY 100M) |
| market_cap_ashare | A-share total market cap (CNY 100M) |
| circulating_cap_ashare | A-share circulating share capital (10K shares) |
| capitalization_ashare | A-share total share capital (10K shares) |

3. financial: financial statement data
   (11 fields; each field is a DataFrame of 2,509 rows × 5,363 columns)

| Field | Description |
|---|---|
| inventories | Inventories (CNY) |
| total_current_assets | Total current assets (CNY) |
| fixed_assets | Fixed assets (CNY) |
| good_will | Goodwill (CNY) |
| total_assets | Total assets (CNY) |
| total_liability | Total liabilities (CNY) |
| operating_revenue | Operating revenue (CNY) |
| operating_profit | Operating profit (CNY) |
| total_profit | Total profit (CNY) |
| net_profit | Net profit (CNY) |
| basic_eps | Basic earnings per share (CNY) |
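A minimal sketch of how these fields are typically consumed once loaded — it assumes the `fetch_data` helper documented in the README and the field names from the tables above:

```python
import firefin

# "close" comes from the quote category, "pe_ratio" from the valuation category.
data = firefin.fetch_data(["close", "pe_ratio"])

# Daily simple returns from close prices (rows are dates, columns are securities).
returns = data["close"].pct_change()

# An illustrative value signal: cross-sectional earnings yield (inverse P/E).
value_factor = 1.0 / data["pe_ratio"]
```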
--------------------------------------------------------------------------------
/docs/docs/data/data.md:
--------------------------------------------------------------------------------
---
title: Data Management
permalink: /data/
nav_order: 4
---

# Data Management

Fire provides a user-friendly interface for downloading and managing financial data. By leveraging the pre-cleaned and processed data pipeline from the Fire Institute, you can focus more on research and modeling rather than data preparation.

Currently, Fire only provides data for the Chinese A-share market. We will provide more data in the future.

# Download Data

We provide a simple command-line interface to download the data. You can use the following command to download the data:

```bash
firefin download
```

This command will download the latest data from the Fire Institute and store it in the `~/.fire/data/raw` directory. All data is organized in Feather format. (We may consider a database or key-value store in the future.) Because we do not update the data frequently, we chose Feather for its fast read/write speed.

# Load Data

If you have downloaded the data manually or received it from another source, you can use the following command to load it into the Firefin system:

```bash
firefin load
```

This command will extract the contents of the provided tar file and place them in the appropriate directory within the Firefin system.

# Data Structure

The data is organized in a structured format to facilitate easy access and manipulation. Here is an overview of the data structure:

| Date | security1 | security2 | security3 | ... | securityN |
|------------|------------|------------|------------|------|------------|
| 2023-01-01 | 10.5 | 10.7 | 10.8 | ... | 10.9 |
| 2023-01-02 | 10.6 | 10.8 | 10.9 | ... | 11.1 |
| ... | ... | ... | ... | ... | ... |
| 2023-12-31 | 11.0 | 11.2 | 11.3 | ... | 11.4 |

1. All data is stored in a single Feather file named `data_name.feather`.
2. Each row represents a date.
3. Each column represents a security, identified by its ticker symbol.
4. The values in the cells represent the closing prices of the securities on the corresponding dates.
5. **The index (dates) and columns (securities) are exactly the same across all datasets** of a given market, for example the Chinese A-share market.

With the above structure, you can easily perform time-series analysis, portfolio optimization, and other financial analyses, without worrying about data alignment issues.
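Because every dataset shares the same index and columns, element-wise pandas operations line up with no reindexing. A minimal sketch (field names as documented above):

```python
import firefin

data = firefin.fetch_data(["close", "volume"])
close, volume = data["close"], data["volume"]

# Identical index/columns mean arithmetic needs no explicit alignment.
assert close.index.equals(volume.index) and close.columns.equals(volume.columns)

# e.g. per-security daily traded value
traded_value = close * volume
```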
--------------------------------------------------------------------------------
/docs/docs/evaluation/AcaEvaluatorModel.md:
--------------------------------------------------------------------------------
# AcaEvaluatorModel

`AcaEvaluatorModel` is a class designed for evaluating the performance of a **single-factor model** using various asset pricing methodologies. It supports portfolio sorting, cross-sectional regression, information coefficient calculations, and anomaly tests.

---

## Class Initialization

```python
AcaEvaluatorModel(factor: pd.DataFrame, forward_returns: ForwardReturns, return_adj: pd.DataFrame)
```

**Parameters**

* `factor` *(pd.DataFrame)*: Factor exposure data (Time × Stock)
* `forward_returns` *(dict\[str, pd.DataFrame])*: Future returns mapped by holding periods (Time × Stock)
* `return_adj` *(pd.DataFrame)*: DataFrame of adjusted returns (Time × Stock)

---

## Methods

### `run_single_sort`

Perform single-factor portfolio sorting.

**Parameters**

* `quantiles` *(int)*: Number of quantile groups (e.g. 5 for quintiles)
* `value_weighted` *(bool)*: Use value-weighted portfolios if `True`; otherwise, equal-weighted
* `return_stats` *(bool)*: Whether to return statistics for the H-L portfolio
* `market_cap` *(pd.DataFrame)*: Market cap data (required if `value_weighted=True`)
* `get_quantile_sorts` *(bool)*: Whether to return quantile labels for each stock

**Returns**

* If `return_stats=True`: `Tuple[QuantileReturns, dict]`
* Else: `QuantileReturns`

---

### `run_fama_macbeth`

Run two-stage Fama-MacBeth cross-sectional regression.

**Parameters**

* `window` *(int)*: Rolling window size for first-stage regression (default: 252)
* `return_stats` *(bool)*: Whether to return statistical summary

**Returns**

* If `return_stats=True`: `Tuple[RegressionResult, dict]`
* Else: `RegressionResult`

---

### `run_ic`

Compute Information Coefficients (IC) across time.

**Parameters**

* `method` *(str)*: Correlation type, one of `'pearson'`, `'spearman'`, or `'kendall'`

**Returns**

* `pd.DataFrame`: IC values by period

---

### `run_regression`

Run static or rolling regression of returns on factor exposures.

**Parameters**

* `rolling` *(bool)*: Whether to run rolling regression
* `window` *(int)*: Rolling window size (only used if `rolling=True`)
* `fit_intercept` *(bool)*: Include intercept term if `True`

**Returns**

* `BatchRegressionResult` or `dict` (if rolling)

---

### `run_anomaly_test`

Conduct anomaly tests by regressing returns on the factor.

**Parameters**

* `portfolio_returns` *(dict\[str, pd.DataFrame])*: DataFrame of portfolio returns, with each column representing a distinct portfolio (quantile returns)
* `cov_type` *(Optional\[str])*: Type of covariance estimator (e.g., `'HAC'`, `'HC0'`, etc.)
* `cov_kwds` *(Optional\[dict])*: Additional keyword arguments for the covariance estimator
* `return_stats` *(bool)*: Whether to return summary statistics

**Returns**

* If `return_stats=True`: `Tuple[AnomalyTest, pd.DataFrame]`
* Else: `AnomalyTest`

---

### `run_all`

Run all available evaluation methods and return results in a dictionary.

**Returns**

* `dict`: Keys include:

  * `'single_sort_res'`, `'single_sort_stat'`
  * `'fama_macbeth'`
  * `'information_coefficient'`
  * `'regression'`
  * `'anomaly'`

---

## Notes

* `run_regression` uses either time-by-time OLS or rolling regression depending on the `rolling` flag.
* `run_all` is useful for executing a full evaluation pipeline for a single factor.
* Ensure `market_cap` is provided when performing value-weighted portfolio sorts.
* `return_adj` in `run_fama_macbeth` should be matched to the target return horizon.
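A minimal end-to-end sketch of the workflow described above. It assumes the `firefin` data helpers from the README; the import path is inferred from the repo layout, and constructor arguments follow the signature at the top of this page.

```python
import firefin
# Import path inferred from the repo layout; adjust if firefin re-exports the class.
from firefin.evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel

data = firefin.fetch_data(["open", "close", "volume", "return_adj"])
factor = data["close"].rolling(20).corr(data["volume"])  # any (Time x Stock) factor
fr = firefin.compute_forward_returns(data["open"].shift(-1), [1, 5, 10])

model = AcaEvaluatorModel(factor=factor, forward_returns=fr, return_adj=data["return_adj"])
quantile_rets, hl_stats = model.run_single_sort(quantiles=5, value_weighted=False, return_stats=True)
ic = model.run_ic(method="pearson")
results = model.run_all()
```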
--------------------------------------------------------------------------------
/docs/docs/evaluation/AcaEvaluatorModelComparison.md:
--------------------------------------------------------------------------------
# AcaEvaluatorModelComparison

`AcaEvaluatorModelComparison` is a class designed for evaluating **multi-factor models**, supporting methods such as double portfolio sorting and maximum Sharpe ratio (MSR) comparison between two models.

---

## Class Initialization

```python
AcaEvaluatorModelComparison(factor1: pd.DataFrame, factor2: pd.DataFrame, forward_returns: ForwardReturns)
```

**Parameters**

* `factor1` *(pd.DataFrame)*: First factor exposure matrix (Time × Stock)
* `factor2` *(pd.DataFrame)*: Second factor exposure matrix (Time × Stock)
* `forward_returns` *(dict\[str, pd.DataFrame])*: Future returns by holding period (Time × Stock)

---

## Methods

### `run_double_sort`

Perform double-sort portfolio sorting based on two factors.

**Parameters**

* `quantiles` *(Tuple\[int, int])*: Number of quantiles for each factor (e.g., (5, 5))
* `dependent` *(bool)*: Whether to apply dependent (conditional) sorting
* `value_weighted` *(bool)*: Use value-weighted returns if `True`; otherwise equal-weighted
* `market_cap` *(pd.DataFrame)*: Market cap data, required if `value_weighted=True`
* `get_quantile_sorts` *(bool)*: Whether to return the portfolio labels for each stock

**Returns**

* `QuantileReturns` or `dict[str, pd.DataFrame]` (if `get_quantile_sorts=True`)

---

### `run_msr_test`

Compare the maximum Sharpe ratios (MSR) of the two models using a statistical test.

**Parameters**

* `regularize` *(bool)*: Whether to apply shrinkage regularization to the covariance matrix

**Returns**

* `dict`:

  * `'msr_a'`: Maximum Sharpe ratio for model A
  * `'msr_b'`: Maximum Sharpe ratio for model B
  * `'test_stat'`: Z-statistic of the MSR test
  * `'p_value'`: Corresponding two-sided p-value

---

### `run_all`

Run all available evaluation methods in the class.

**Parameters**

* `market_cap` *(pd.DataFrame)*: Required if `value_weighted=True` in `run_double_sort`

**Returns**

* `dict`:

  * `'double_sort'`: Result of the double sort
  * `'msr_test'`: Result of the MSR comparison between the two factor models

---

## Notes

* `run_double_sort` supports both independent and nested (conditional) sorting based on two factors.
* `run_msr_test` is based on a Z-test for comparing Sharpe ratios under multivariate settings.
* `run_all` is a quick way to benchmark model performance using all implemented tools.
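A minimal sketch of a two-factor comparison, assuming the `firefin` data helpers from the README; the import path is inferred from the repo layout, and the two factor definitions are purely illustrative.

```python
import firefin
# Import path inferred from the repo layout; adjust if firefin re-exports the class.
from firefin.evaluation.academia.AcaEvaluatorModelComparison import AcaEvaluatorModelComparison

data = firefin.fetch_data(["open", "close", "volume"])
factor1 = data["close"].rolling(20).corr(data["volume"])  # price-volume correlation
factor2 = -data["close"].pct_change(20)                   # 20-day reversal, illustrative only
fr = firefin.compute_forward_returns(data["open"].shift(-1), [1, 5, 10])

comp = AcaEvaluatorModelComparison(factor1=factor1, factor2=factor2, forward_returns=fr)
double_sort = comp.run_double_sort(quantiles=(5, 5), dependent=True, value_weighted=False)
msr = comp.run_msr_test(regularize=False)  # keys: msr_a, msr_b, test_stat, p_value
```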
45 |
46 | **Parameters:**
47 |
48 | * `quantiles` (int): Number of quantile groups (e.g., 5 for quintiles)
49 | * `value_weighted` (bool): If True, portfolios are value-weighted; otherwise equal-weighted
50 | * `return_stats` (bool): Whether to return statistics (mean, t-value, p-value) of high-minus-low (H-L) portfolios
51 | * `market_cap` (pd.DataFrame): Required if `value_weighted` is True; same shape as `factor`
52 | * `get_quantile_sorts` (bool): Return group labels of stocks by quantile
53 |
54 | **Returns:**
55 |
56 | * Quantile portfolio returns, or a tuple of (returns, statistics) if `return_stats=True`
57 |
58 | ---
59 |
60 | ### `run_double_sort()`
61 |
62 | Perform double sorting based on two factors.
63 |
64 | **Parameters:**
65 |
66 | * `factor2` (pd.DataFrame): Second factor
67 | * `quantiles` (tuple\[int, int]): Quantile group counts for each factor
68 | * `dependent` (bool): Use nested sort if True
69 | * `value_weighted` (bool): If True, portfolios are value-weighted; otherwise equal-weighted
70 | * `market_cap` (pd.DataFrame): Required if `value_weighted` is True; same shape as `factor`
71 | * `get_quantile_sorts` (bool): Return group labels of stocks by quantile
72 |
73 | **Returns:**
74 |
75 | * Portfolio return structure or dictionary of quantile groupings
76 |
77 | ---
78 |
79 | ### `run_fama_macbeth()`
80 |
81 | Run two-pass Fama-MacBeth regression.
82 |
83 | **Parameters:**
84 |
85 | * `return_adj` (pd.DataFrame): Adjusted returns matrix
86 | * `window` (int): First-stage rolling window (default: 252)
87 | * `return_stats` (bool): Return t-statistics and significance
88 |
89 | **Returns:**
90 |
91 | * Regression result or (result, statistics) tuple
92 |
93 | ---
94 |
95 | ### `run_ic()`
96 |
97 | Calculate the Information Coefficient between factor values and future returns.
98 |
99 | **Parameters:**
100 |
101 | * `method` (str): Correlation method ('pearson', 'spearman', or 'kendall')
102 |
103 | **Returns:**
104 |
105 | * `pd.DataFrame`: Time series of IC values
106 |
107 | ---
108 |
109 | ### `run_msr_test()`
110 |
111 | Compare the maximum Sharpe ratios (MSRs) of two factor models using a z-test.
112 |
113 | **Parameters:**
114 |
115 | * `model_a_factors` (pd.DataFrame): Factor returns of Model A (Time × K)
116 | * `model_b_factors` (pd.DataFrame): Factor returns of Model B (Time × K)
117 | * `regularize_covariance` (bool): If True, regularize the covariance matrix.
118 |
119 | **Returns:**
120 |
121 | * Dictionary with keys:
122 | - `msr_a`: Maximum Sharpe ratio of Model A
123 | - `msr_b`: Maximum Sharpe ratio of Model B
124 | - `test_stat`: z-test statistic comparing MSRs
125 | - `p_value`: p-value of the test
126 |
127 | ---
128 |
129 | ### `run_regression()`
130 |
131 | Run either standard or rolling time-series regression on test portfolios, depending on the `rolling` flag.
132 |
133 | **Parameters:**
134 |
135 | * `rolling` (bool, optional): Whether to perform rolling regression, by default False.
136 | * `window` (int, optional): Rolling window size (only used if `rolling=True`), by default 60.
137 | * `fit_intercept` (bool, optional): Whether to include an intercept in the regression, by default True.
138 |
139 | **Returns:**
140 |
141 | * `BatchRegressionResult` or `dict`: a static regression result object, or a dictionary of rolling results.
142 | ---
143 |
144 | ### `get_grs_test()`
145 |
146 | Run the GRS test for overall model explanatory power.
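For background, a common textbook form of the statistic, with `N` test portfolios, `K` factors and `T` periods, is

$$
\mathrm{GRS} \;=\; \frac{T-N-K}{N}\cdot\frac{\hat{\alpha}^{\top}\hat{\Sigma}^{-1}\hat{\alpha}}{1+\bar{\mu}_f^{\top}\hat{\Omega}^{-1}\bar{\mu}_f} \;\sim\; F_{N,\,T-N-K},
$$

where the numerator quadratic form uses the vector of estimated intercepts and the residual covariance matrix, and the denominator uses the factor sample mean and covariance (maximum-likelihood, divide-by-T estimates). Small-sample scalings differ slightly across presentations, so the implementation's exact constant may vary.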
147 | 148 | **Parameters:** 149 | 150 | * `test_portfolios`: Time-series returns of test portfolios 151 | * `plot`: Whether to generate visual output 152 | 153 | **Returns:** 154 | 155 | * Dictionary with keys: `grs_stat`, `p_value`, `alphas`, `t_stats`, `residual_cov`, `betas` 156 | 157 | --- 158 | 159 | ### `get_hj_distance_test()` 160 | 161 | Compute HJ distance to assess pricing error. 162 | 163 | **Parameters:** 164 | 165 | * `test_portfolios`: Portfolio return matrix 166 | * `plot`: Whether to visualize 167 | 168 | **Returns:** 169 | 170 | * Dictionary with HJ distance, t-stat, alpha, betas, residual\_cov 171 | 172 | --- 173 | 174 | ### `compare_model_alphas()` 175 | 176 | Compare intercepts across different models. 177 | 178 | **Parameters:** 179 | 180 | * `models`: Dictionary of model name → factor return 181 | * `test_portfolios`: Test portfolio returns 182 | * `plot`: Whether to display comparison plot 183 | 184 | **Returns:** 185 | 186 | * Dictionary of results per model: alpha, t\_stat, mean\_abs\_alpha, mean\_abs\_t 187 | 188 | --- 189 | 190 | ### `run_horse_race_regression()` 191 | 192 | Run horse race regression to assess marginal explanatory power. 193 | 194 | **Parameters:** 195 | 196 | * `candidate_factors`: Dict of factor name → exposure DataFrame 197 | * `forward_return_key`: Key to select the return horizon 198 | * `date`: If set, single-period regression; otherwise multi-period 199 | * `plot`: Whether to visualize t-stats 200 | 201 | **Returns:** 202 | 203 | * Dictionary with `coefs`, `mean_coef`, `t_stat`, `p_value` 204 | 205 | --- 206 | 207 | ### `get_spanning_test()` 208 | 209 | Test whether a new factor can be spanned by base factors. 210 | 211 | **Parameters:** 212 | 213 | * `new_factor`: Series of the new factor 214 | * `base_model_factors`: Existing model factors (DataFrame) 215 | * `plot`: Whether to show visualization 216 | 217 | **Returns:** 218 | 219 | * Dictionary with `r_squared`, `alpha`, `t_stat`, `p_value`, `beta`, `resid_std` 220 | 221 | --- 222 | 223 | ### `run_subsample_analysis()` 224 | 225 | Run out-of-sample robustness checks across different time periods. 226 | 227 | **Parameters:** 228 | 229 | * `method`: "ic", "alpha", or "quantile\_returns" 230 | * `split_dates`: List of split timestamps 231 | * `forward_return_key`: Return name (if needed) 232 | * `quantiles`: Group count for sorting (if used) 233 | * `plot`: Whether to visualize comparison 234 | 235 | **Returns:** 236 | 237 | * Dictionary with per-sample evaluation results 238 | 239 | --- 240 | 241 | ### `compute_vif()` 242 | 243 | Detect multicollinearity using variance inflation factors (VIF). 
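A hedged sketch of the underlying computation using statsmodels (the `factors` frame and `vif_table` helper are illustrative placeholders; the class method may differ in details and additionally reports `max_vif`/`mean_vif`):

```python
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_table(factors: pd.DataFrame) -> pd.Series:
    """VIF_k = 1 / (1 - R_k^2), where R_k^2 regresses factor k on the others."""
    X = sm.add_constant(factors.dropna())
    vifs = {col: variance_inflation_factor(X.values, i)
            for i, col in enumerate(X.columns) if col != "const"}
    return pd.Series(vifs, name="vif")
```

As a rule of thumb, VIFs above roughly 5–10 are usually read as a multicollinearity warning.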
244 |
245 | **Parameters:**
246 |
247 | * `factors`: Factor exposure matrix (T × K)
248 | * `plot`: Show bar plot of VIFs
249 |
250 | **Returns:**
251 |
252 | * Dictionary with `vif` (Series), `max_vif`, `mean_vif`
253 |
-------------------------------------------------------------------------------- /docs/docs/evaluation/evaluation.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Evaluate Factor
3 | permalink: /evaluate/
4 | nav_order: 6
5 | ---
6 | How to evaluate factors in your project
7 | {: .fs-6 .fw-300 }
-------------------------------------------------------------------------------- /docs/docs/release-plan.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Release Plan
3 | permalink: /release-plan/
4 | nav_order: 2
5 | ---
6 |
7 | # Release Plan
8 |
9 | | Stage Name | Begin Time | End Time | Note |
10 | | ----------------------------- | ---------- | --------- | -------------------------------------------------- |
11 | | Collect key features | 2025/2/10 | 2025/2/20 | Collect requirements for the release |
12 | | Change Review | 2025/2/20 | 2025/2/25 | Review project requirements (upgrade / retire / deprecate) |
13 | | Develop | 2025/2/25 | 2025/3/15 | Develop new features and merge into master |
14 | | Build & Alpha | 2025/3/16 | 2025/3/30 | Merge newly developed features, release the Alpha version |
15 | | Test round 1 | 2025/3/30 | 2025/4/07 | First round of testing |
16 | | Beta version release | 2025/4/08 | 2025/4/08 | Beta version release |
17 | | Test round 2 | 2025/4/09 | 2025/4/15 | Bug fixes and second round of testing |
18 | | Release Review | 2025/4/16 | 2025/4/16 | Release go/no-go decision |
19 | | Release preparation | 2025/4/16 | 2025/4/16 | Pre-release preparation, organize release artifacts |
20 | | Release | 2025/4/17 | 2025/4/17 | Official release |
21 |
22 |
23 |
24 |
25 |
26 | # Feature list
27 |
28 | Status legend:
29 |
30 | - Discussion (proposal under discussion, requirement not yet accepted)
31 | - Developing (in development)
32 | - Testing (in testing)
33 | - Accepted (accepted and delivered)
34 | - Rejected (rejected / not delivered)
35 |
36 |
37 | | Feature Name | Status | Owner |
38 | | ------------ | ------ | ----- |
39 | | Consensus Factor Models: CAPM, FF3, FF3+Mom, FF3+MOM+LIQ, FF5 | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
40 | | Portfolio Sort | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
41 | | Double sorting | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
42 | | Fama-MacBeth Regression | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
43 | | Testing Anomalies (Time-Series Regression) | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
44 | | GRS Test | Developing | [@feathertop](https://github.com/feathertop) [@mitcshi](https://github.com/mitcshi) |
45 | | Support More Algorithms | Developing | [@qiaobaochen](https://github.com/qiaobaochen) |
46 | | Project CI/CD | Developing | [@qiaobaochen](https://github.com/qiaobaochen)|
-------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
1 | ---
2 | title: Home
3 | layout: home
4 | nav_order: 1
5 | ---
6 | # F.I.R.E. Factor Investment Research Engine
7 |
8 | This project is the companion open-source toolkit for the book _Navigating the Factor Zoo: The Science of Quantitative Investing_.
9 |
10 | The Fire project serves as a development and evaluation toolkit for factor research and portfolio construction.
It is designed to be simple and easy to use, and it is built on top of popular Python libraries like pandas, numpy, and scikit-learn.
11 |
12 | Fire focuses on three critical aspects of factor research and portfolio construction:
13 |
14 | 1. **Data Management**: Fire provides a user-friendly interface for downloading and managing financial data. By leveraging the pre-cleaned and processed data pipeline from the Fire Institute, you can focus more on research and modeling rather than data preparation.
15 |
16 | 2. **Construction (Calculation)**: Fire offers a variety of algorithms for factor construction. Additionally, it allows users to build their own factors using popular libraries such as pandas, numpy, and scikit-learn.
17 |
18 | 3. **Evaluation**: Factor evaluation is a complex and crucial step in research. Fire provides a comprehensive set of tools to assess factor performance, bridging the gap between academic and industry evaluation practices.
19 |
20 | ----
21 |
22 | ## Quick Start
23 |
24 | ## Installation
25 |
26 | ```bash
27 | # The package has not been released to PyPI yet, so `pip install firefin` will not work; install from source instead.
28 | pip install firefin
29 |
30 | # Install from source for local testing
31 | ## replace $ThisRepoURL with the actual repo url
32 | git clone $ThisRepoURL
33 | ## install from source
34 | pip install -e .
35 | ```
36 |
37 | ## Usage
38 |
39 | Download the data
40 | from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz)
41 |
42 | Run the command below to download the data and put it in the correct path automatically.
43 |
44 | ```bash
45 | # This repo has not been fully released yet, so you may need to download the data manually (see the link above).
46 | # Auto download data
47 | firefin download
48 | ```
49 |
50 | If you have already downloaded the data from [here](https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz), you can run the following command to verify the archive and put the data in the correct path:
51 |
52 | ```bash
53 | # replace path_to_data.tar.gz with the actual path
54 | firefin load path_to_data.tar.gz
55 | ```
56 |
57 | ## Start coding
58 |
59 | ```python
60 | import firefin
61 |
62 | # get data
63 | data = firefin.fetch_data(["open", "close", "volume"])
64 | open_price = data["open"]
65 |
66 |
67 | def pv_corr(close, volume):
68 |     # price volume correlation
69 |     return close.rolling(20).corr(volume)
70 |
71 |
72 | factor = pv_corr(data["close"], data["volume"])
73 |
74 | # compute forward returns
75 | fr = firefin.compute_forward_returns(open_price.shift(-1), [1, 5, 10])
76 |
77 | # evaluate factor
78 | mng = firefin.Evaluator(factor, fr)
79 | mng.get_ic("pearson")
80 | mng.get_quantile_returns(5)
81 |
82 | ```
-------------------------------------------------------------------------------- /file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/file.py
-------------------------------------------------------------------------------- /firefin/__init__.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 |
4 | from .compute.window import *
5 | from .data.gateway import fetch_data
6 | from .evaluation.eva_utils import compute_forward_returns, compute_ic, compute_quantile_returns
7 | from
.evaluation.industry.evaluator import Evaluator
8 | from .core.plot.plots import plt_ic, plt_quantile_cumulated_end_returns, plt_quantile_cumulative_returns
9 | from .evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel
10 | from .evaluation.academia.AcaEvaluatorModelComparison import AcaEvaluatorModelComparison
-------------------------------------------------------------------------------- /firefin/cli/command.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 |
4 | import os
5 | import sys
6 | import click
7 | import subprocess
8 | from ..common.config import DATA_PATH, logger
9 |
10 |
11 | @click.group()
12 | def cli():
13 |     pass
14 |
15 |
16 | @click.command(help="Display help")
17 | def help():
18 |     click.echo("Help")
19 |
20 |
21 | def _prepare_folder():
22 |     # check if data directory exists
23 |     if not DATA_PATH.exists():
24 |         DATA_PATH.mkdir(parents=True, exist_ok=True)
25 |         logger.info("Data directory is created.")
26 |     else:
27 |         logger.info("Data directory already exists. Skipping creating directory.")
28 |
29 |
30 | # TODO: Add more data sources
31 | @click.command(help="Download data")
32 | @click.option('--data_url', default=None, help='download from provided url')
33 | def download(data_url):
34 |     logger.info("Preparing Data for the first time ...")
35 |     _prepare_folder()
36 |
37 |     if data_url:
38 |         if not data_url.startswith("http"):
39 |             raise Exception("Please provide a valid url to download data from.")
40 |         if not data_url.endswith(".tar.gz"):
41 |             raise Exception("Please provide a valid url to download data from. The url should end with .tar.gz")
42 |         data_file_name = data_url.split("/")[-1]
43 |         raw_data_path = DATA_PATH / data_file_name
44 |         request_url = data_url
45 |     else:
46 |         # download the default package
47 |         logger.info("No URL provided, will download default AStockData.tar.gz from GitHub.")
48 |         raw_data_path = DATA_PATH / "AStockData.tar.gz"
49 |         request_url = (
50 |             "https://github.com/fire-institute/fire/releases/download/marketdata/AStockData.tar.gz"
51 |         )
52 |
53 |     # Check if the gz file already exists
54 |     if raw_data_path.exists():
55 |         logger.info("Data already exists, removing it first ...")
56 |         # ensure the file is removed before downloading again
57 |         raw_data_path.unlink(missing_ok=True)  # remove the file
58 |         logger.info("Data removed.")
59 |
60 |     logger.info("Downloading data ...")
61 |     # Download data from the file server
62 |     try:
63 |         subprocess.run(f"wget {request_url} -O {raw_data_path}", shell=True, check=True)
64 |         subprocess.run(
65 |             f'tar -xvf {raw_data_path} -C {DATA_PATH} --strip-components=1',
66 |             shell=True,
67 |             check=True
68 |         )
69 |     except subprocess.CalledProcessError as e:
70 |         logger.error(f"Command execution failed: {e}")
71 |         sys.exit(1)
72 |     except KeyboardInterrupt:
73 |         logger.info("KeyboardInterrupt (Ctrl+C), program terminated")
74 |         sys.exit(0)
75 |
76 |
77 | @click.command(help="Prepare data")
78 | @click.argument("file_path", type=click.Path(exists=True))
79 | def load(file_path: str = None):
80 |     logger.info("Preparing Data for the first time ...")
81 |     _prepare_folder()
82 |
83 |     # tar unzip the file, print progress
84 |     try:
85 |         subprocess.run(
86 |             f'tar -xvf {file_path} -C {DATA_PATH} --strip-components=1',
87 |             shell=True,
88 |             check=True
89 |         )
90 |     except subprocess.CalledProcessError as e:
91 |         logger.error(f"Command
execution failed: {e}")
92 |         sys.exit(1)
93 |     except KeyboardInterrupt:
94 |         logger.info("KeyboardInterrupt (Ctrl+C), program terminated")
95 |         sys.exit(0)
96 |
97 | cli.add_command(help)
98 | cli.add_command(download)
99 | cli.add_command(load)
-------------------------------------------------------------------------------- /firefin/common/config.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 | import os
4 | import pathlib
5 | import yaml
6 | import json
7 | from loguru import logger
8 |
9 | # Load configuration from YAML file
10 | with open(os.path.join(os.path.dirname(__file__), "config.yaml"), "r") as stream:
11 |     config = yaml.safe_load(stream)
12 |
13 |
14 |
15 | # Define DATA_PATH based on configuration, expanding user and resolving path
16 |
17 | if os.name == "posix":
18 |     DATA_PATH = config.get("data_paths", {}).get("unix", "~/.fire/data/raw/")
19 | else:
20 |     DATA_PATH = config.get("data_paths", {}).get("windows", "%USERPROFILE%\\.fire\\data\\raw")
21 |
22 | # resolve ~ and env vars
23 | DATA_PATH = pathlib.Path(DATA_PATH).expanduser().resolve()
24 |
25 | # load data maps from config
26 | DATA_MAPS = config.get("data_maps", {})
27 |
28 | json_files = list(DATA_PATH.glob("*.json"))
29 | if json_files:
30 |     for json_file in json_files:
31 |         with open(json_file, "r") as f:
32 |             DATA_MAPS.update(json.load(f))
33 | else:
34 |     import multiprocessing as _mp
35 |
36 |     if _mp.current_process().name == "MainProcess":
37 |         logger.info("No additional JSON files found in DATA_PATH, load default DATA_MAPS.")
38 |
-------------------------------------------------------------------------------- /firefin/common/config.yaml: --------------------------------------------------------------------------------
1 | data_paths:
2 |   unix: ~/.fire/data/raw/
3 |   windows: '%USERPROFILE%\.fire\data\raw\'
4 |
5 | data_maps:
6 |   # quote: price/volume data
7 |   open: file::feather
8 |   close: file::feather
9 |   high: file::feather
10 |   low: file::feather
11 |   volume: file::feather
12 |   money: file::feather
13 |   return_adj: file::feather
14 |   vwap: file::feather
15 |   adj_factor: file::feather
16 |   open_dr: file::feather
17 |   high_dr: file::feather
18 |   low_dr: file::feather
19 |   close_dr: file::feather
20 |   volume_dr: file::feather
21 |   vwap_dr: file::feather
22 |   FinanceValue: file::feather
23 |   FinanceBuyValue: file::feather
24 |   FinanceRefundValue: file::feather
25 |   SecurityVolume: file::feather
26 |   SecuritySellVolume: file::feather
27 |   SecurityRefundVolume: file::feather
28 |   SecurityValue: file::feather
29 |   TradingValue: file::feather
30 |   FinaInTotalRatio: file::feather
31 |   SecuInTotalRatio: file::feather
32 |   shares_holding: file::feather
33 |   hold_ratio: file::feather
34 |   adjusted_hold_ratio: file::feather
35 |   # valuation: valuation data
36 |   circulating_market_cap: file::feather
37 |   pcf_ratio: file::feather
38 |   market_cap: file::feather
39 |   pe_ratio_lyr: file::feather
40 |   circulating_cap: file::feather
41 |   capitalization: file::feather
42 |   pb_ratio: file::feather
43 |   pe_ratio: file::feather
44 |   ps_ratio: file::feather
45 |   turnover_ratio: file::feather
46 |   circulating_market_cap_ashare: file::feather
47 |   market_cap_ashare: file::feather
48 |   circulating_cap_ashare: file::feather
49 |   capitalization_ashare: file::feather
50 |   # financial: financial statement data
51 |   inventories: file::feather
52 |
total_current_assets: file::feather 53 | fixed_assets: file::feather 54 | good_will: file::feather 55 | total_assets: file::feather 56 | total_liability: file::feather 57 | operating_revenue: file::feather 58 | operating_profit: file::feather 59 | total_profit: file::feather 60 | net_profit: file::feather 61 | basic_eps: file::feather 62 | 63 | -------------------------------------------------------------------------------- /firefin/common/const.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import pandas as pd 5 | 6 | # for minute data bartimes 7 | _morning = pd.date_range("2020-01-01 09:30", "2020-01-01 11:30", freq="1 min") 8 | _afternoon = pd.date_range("2020-01-01 13:00", "2020-01-01 15:00", freq="1 min") 9 | MIN_BARTIMES = _morning.union(_afternoon).strftime("%H:%M") 10 | -------------------------------------------------------------------------------- /firefin/compute/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | """ 5 | utility functions for computing 6 | 7 | """ 8 | -------------------------------------------------------------------------------- /firefin/compute/window.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import pandas as pd 5 | import typing 6 | from ..core.algorithm import _numba_funcs 7 | 8 | __all__ = ["ts_corr"] 9 | 10 | 11 | def ts_corr(x: pd.DataFrame, y: pd.DataFrame, n: int, method: typing.Literal["pearson", "kendall", "spearman"]): 12 | x, y = x.align(y, join="outer", copy=False) 13 | result = pd.DataFrame( 14 | _numba_funcs.ts_corr(x.values, y.values, n, method), 15 | index=x.index, 16 | columns=x.columns, 17 | ) 18 | return result 19 | -------------------------------------------------------------------------------- /firefin/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/__init__.py -------------------------------------------------------------------------------- /firefin/core/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/algorithm/__init__.py -------------------------------------------------------------------------------- /firefin/core/algorithm/_numba_funcs.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import numpy as np 5 | from numba import njit 6 | 7 | 8 | @njit 9 | def _validate_pairwise(x, y): 10 | if x.ndim != 2 or y.ndim != 2: 11 | raise ValueError("_validate_pairwise: Both inputs must be 2D arrays") 12 | n1, m1 = x.shape 13 | n2, m2 = y.shape 14 | if n1 != n2: 15 | raise 
ValueError("_validate_pairwise: Both inputs must have the same number of rows") 16 | if m1 != m2 and min(m1, m2) != 1: 17 | raise ValueError("_validate_pairwise: Both inputs must have the same number of columns or one column") 18 | 19 | 20 | @njit 21 | def _corr_pearson(x, y): 22 | assert len(x) == len(y) 23 | msk = np.isfinite(x) & np.isfinite(y) 24 | if msk.sum() <= 3: 25 | return np.nan 26 | elif msk.all(): 27 | x_ = x 28 | y_ = y 29 | else: 30 | x_ = x[msk] 31 | y_ = y[msk] 32 | mean_x = np.mean(x_) 33 | mean_y = np.mean(y_) 34 | x_centered = x_ - mean_x 35 | y_centered = y_ - mean_y 36 | var_x = np.sum(x_centered**2) 37 | if var_x == 0: 38 | return np.nan 39 | var_y = np.sum(y_centered**2) 40 | if var_y == 0: 41 | return np.nan 42 | cov = np.sum(x_centered * y_centered) 43 | return cov / np.sqrt(var_x * var_y) 44 | 45 | 46 | @njit 47 | def corr(x, y, method="pearson"): 48 | if x.ndim != 1 or y.ndim != 1: 49 | raise ValueError("corr: Both inputs must be 1D arrays") 50 | if x.shape != y.shape: 51 | raise ValueError("corr: Both inputs must have the same shape") 52 | if method == "pearson": 53 | return _corr_pearson(x, y) 54 | else: 55 | raise NotImplementedError("corr: Only Pearson correlation is supported") 56 | 57 | 58 | @njit 59 | def ts_corr(x, y, w, method="pearson"): 60 | _validate_pairwise(x, y) 61 | n, m1 = x.shape 62 | _, m2 = y.shape 63 | k = max(m1, m2) 64 | out = np.full((n, k), np.nan) 65 | for i in range(n): 66 | x_ = x[max(0, i - w + 1) : i + 1] 67 | y_ = y[max(0, i - w + 1) : i + 1] 68 | for j in range(k): 69 | x__ = x_[:, min(j, m1 - 1)] 70 | y__ = y_[:, min(j, m2 - 1)] 71 | out[i, j] = corr(x__, y__, method) 72 | return out 73 | -------------------------------------------------------------------------------- /firefin/core/algorithm/newey_west_ttest_1samp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate Newey-West Adjusted Standard Error in t-test for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing a one-sample t-test with Newey-West adjusted standard errors. 5 | The implementation focuses on clarity, comprehensive documentation, and best practices for financial research. 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from typing import Union, Tuple 11 | import statsmodels.api as sm 12 | 13 | class NeweyWestTTest: 14 | """ 15 | A class for performing a one-sample t-test using Newey-West adjusted standard errors. 16 | """ 17 | 18 | @staticmethod 19 | def newey_west_ttest_1samp(data: Union[np.ndarray, pd.Series, list], 20 | popmean: float = 0.0, 21 | lags: int = 4, 22 | nan_policy: str = 'omit') -> Tuple[float, float, float]: 23 | """ 24 | Perform a one-sample t-test using Newey-West adjusted standard errors. 25 | 26 | Parameters 27 | ---------- 28 | data : array-like 29 | The sample data. 30 | popmean : float, optional 31 | The hypothesized population mean (default is 0.0). 32 | lags : int, optional 33 | The number of lags for Newey-West adjustment (default is 4). 34 | nan_policy : {'propagate', 'omit', 'raise'}, optional 35 | Defines how to handle input NaNs: 36 | 'propagate' : if a NaN is present in the input, return NaN for all outputs. 37 | 'omit' : omit NaNs when performing the calculation. If insufficient data remains, return NaN. 38 | 'raise' : if a NaN is present, raise a ValueError. 39 | (default is 'omit'). 40 | 41 | Returns 42 | ------- 43 | t_value : float 44 | The t-statistic. 
45 |         p_value : float
46 |             The p-value for the t-test.
47 |         se : float
48 |             The Newey-West adjusted standard error.
49 |
50 |         Raises
51 |         ------
52 |         ValueError
53 |             If the input data is not one-dimensional or if nan_policy is set to 'raise' and data contains NaNs.
54 |         """
55 |         # Convert input data to a NumPy array
56 |         data_arr = np.asarray(data)
57 |         # Ensure the data is one-dimensional
58 |         if data_arr.ndim != 1:
59 |             raise ValueError("Input data must be a one-dimensional array or series. Only a single variable is allowed.")
60 |
61 |         # Validate nan_policy argument
62 |         if nan_policy not in ['propagate', 'omit', 'raise']:
63 |             raise ValueError("nan_policy must be one of 'propagate', 'omit', or 'raise'.")
64 |
65 |         # Handle NaN values according to nan_policy
66 |         if nan_policy == 'propagate':
67 |             if np.isnan(data_arr).any():
68 |                 return np.nan, np.nan, np.nan
69 |         elif nan_policy == 'raise':
70 |             if np.isnan(data_arr).any():
71 |                 raise ValueError("Input data contains NaN values.")
72 |         elif nan_policy == 'omit':
73 |             data_arr = data_arr[~np.isnan(data_arr)]
74 |             # If insufficient data remains after omitting NaNs, return NaN values (as documented above)
75 |             if data_arr.size < 2:
76 |                 return np.nan, np.nan, np.nan
77 |
78 |         # If the data length is still insufficient, raise an error
79 |         if data_arr.size < 2:
80 |             raise ValueError("Insufficient data (length < 2).")
81 |
82 |         # Adjust the data by subtracting the hypothesized population mean
83 |         adjusted_data = data_arr - popmean
84 |         # Create an intercept term (a column of ones)
85 |         X = np.ones(len(adjusted_data))
86 |         # Fit an OLS model with Newey-West (HAC) standard errors
87 |         model = sm.OLS(adjusted_data, X).fit(cov_type='HAC', cov_kwds={'maxlags': lags})
88 |         # Extract the t-statistic, p-value, and standard error
89 |         t_value = model.tvalues[0]
90 |         p_value = model.pvalues[0]
91 |         se = model.bse[0]
92 |
93 |         return t_value, p_value, se
94 |
-------------------------------------------------------------------------------- /firefin/core/algorithm/regression.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Created : 2025/3/26 17:01
3 | # @Author  : Liao Renjie
4 | # @Email   : liao.renjie@techfin.ai
5 | # @File    : regression.py
6 | # @Software: PyCharm
7 |
8 | import textwrap
9 | import typing
10 |
11 | import numpy as np
12 | import pandas as pd
13 | import statsmodels.api as sm
14 | from joblib import Parallel, delayed
15 |
16 | __all__ = ["least_square", "RollingRegressor", "rolling_regression", "table_regression"]
17 |
18 | NotProvided = object()
19 |
20 |
21 | class RegressionResult:
22 |     """
23 |     Encapsulate the results from `least_square`.
24 |
25 |     Parameters
26 |     ----------
27 |     sm_result: sm.regression.linear_model.RegressionResults
28 |         The regression results object from the statsmodels library.
29 |     fit_intercept: bool
30 |         Whether to fit an intercept term.
31 |     univariate: bool
32 |         Whether it is a univariate regression.
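        If True, the fitted `beta` is exposed as a scalar; otherwise it is a 1-D coefficient array.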
33 | 34 | """ 35 | 36 | def __init__(self, sm_result: sm.regression.linear_model.RegressionResults, fit_intercept: bool, univariate: bool): 37 | self.sm_result = sm_result 38 | self.fit_intercept = fit_intercept 39 | self.univariate = univariate 40 | 41 | @property 42 | def alpha(self): 43 | """float or None""" 44 | if self.fit_intercept: 45 | return self.sm_result.params[0] 46 | else: 47 | return None 48 | 49 | @property 50 | def beta(self): 51 | """1D array if multivariate or float if univariate""" 52 | if self.univariate: 53 | return self.sm_result.params[-1] 54 | else: 55 | # multivariate 56 | if self.fit_intercept: 57 | return self.sm_result.params[1:] 58 | else: 59 | return self.sm_result.params 60 | 61 | @property 62 | def r2(self): 63 | """ 64 | Return the coefficient of determination R² of the regression. 65 | 66 | Returns 67 | ------- 68 | float 69 | The R² value. 70 | """ 71 | return self.sm_result.rsquared 72 | 73 | @property 74 | def r2_adj(self): 75 | """ 76 | Return the adjusted coefficient of determination R² of the regression. 77 | 78 | Returns 79 | ------- 80 | float 81 | The adjusted R² value. 82 | """ 83 | return self.sm_result.rsquared_adj 84 | 85 | @property 86 | def residuals(self): 87 | """ 88 | Return the residuals of the regression. 89 | 90 | Returns 91 | ------- 92 | array 93 | The array of residuals. 94 | """ 95 | return self.sm_result.resid 96 | 97 | 98 | class BatchRegressionResult: 99 | """ 100 | Encapsulate the results of batch regression. 101 | 102 | Parameters 103 | ---------- 104 | beta 105 | The regression coefficients. 106 | alpha: optional 107 | The intercept term, default is None. 108 | r2: optional 109 | The coefficient of determination R², default is None. 110 | r2_adj: optional 111 | The adjusted coefficient of determination R², default is None. 112 | residuals: optional 113 | The residuals, default is None. 114 | """ 115 | 116 | def __init__( 117 | self, 118 | beta, 119 | alpha=None, 120 | r2=None, 121 | r2_adj=None, 122 | residuals=None, 123 | ): 124 | # NOTE: public names will be displayed in __repr__ 125 | self.alpha = alpha 126 | self.beta = beta 127 | self.r2 = r2 128 | self.r2_adj = r2_adj 129 | self.residuals = residuals 130 | 131 | def __repr__(self): 132 | content = {a: getattr(self, a) for a in dir(self) if not a.startswith("_")} 133 | content = ",\n".join( 134 | [f" {k}:\n{textwrap.indent(repr(v), prefix=' ')}" for k, v in content.items() if v is not None] 135 | ) 136 | return f"{self.__class__.__name__}(\n{content}\n)" 137 | 138 | 139 | def _regression( 140 | x: pd.DataFrame | pd.Series, 141 | y: pd.Series, 142 | w: pd.Series = None, 143 | fit_intercept: bool = True, 144 | cov_type: str | None = None, 145 | cov_kwds: dict | None = None, 146 | ) -> sm.regression.linear_model.RegressionResults: 147 | """ 148 | Perform a linear regression using either OLS or WLS. 149 | 150 | Parameters 151 | ---------- 152 | x: pd.DataFrame | pd.Series 153 | The independent variable(s). 154 | y: pd.Series 155 | The dependent variable. 156 | w: pd.Series, optional 157 | The weights for WLS, default is None. 158 | fit_intercept: bool, optional 159 | Whether to fit an intercept term, default is True. 160 | cov_type: str | None, optional 161 | The covariance estimator, default is None. 162 | - If None: use the default homoskedastic standard errors. 163 | - If "HAC": Newey–West heteroskedasticity-and-autocorrelation robust SE. 164 | - Other options supported by statsmodels (e.g. "HC0", "HC1", …). 
165 |     cov_kwds: dict | None, optional
166 |         The keyword arguments for the covariance estimator, default is None.
167 |         For Newey–West, you’d typically pass `{"maxlags": L}` to control lag length.
168 |
169 |     Returns
170 |     -------
171 |     sm.regression.linear_model.RegressionResults
172 |         The regression results.
173 |     """
174 |
175 |     ## if x contains nan, fill nan with 0
176 |     ## TODO: fill nan with 0 is not a good idea, we should use the mean of the column to fill nan
177 |     x = np.nan_to_num(x, nan=0)
178 |     y = np.nan_to_num(y, nan=0)
179 |
180 |     if fit_intercept:
181 |         x = sm.add_constant(x)
182 |     if w is None:
183 |         model = sm.OLS(y, x)
184 |     else:
185 |         ## TODO: fill nan with 0 is not a good idea, we should use the mean of the column to fill nan
186 |         y = np.nan_to_num(y, nan=0)
187 |         model = sm.WLS(y, x, weights=w)
188 |
189 |     if cov_type is None:
190 |         return model.fit()
191 |     else:
192 |         return model.fit(cov_type=cov_type, cov_kwds=cov_kwds or {})
193 |
194 |
195 | def least_square(
196 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray,
197 |     y: pd.Series | np.ndarray,
198 |     w: pd.Series | np.ndarray | None = None,
199 |     fit_intercept: bool = True,
200 | ) -> RegressionResult:
201 |     """
202 |     A simple wrapper around sm.OLS or sm.WLS.
203 |
204 |     Parameters
205 |     ----------
206 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
207 |         The independent variable(s). If one-dimensional, the regression is considered univariate; otherwise it is
208 |         considered multivariate. This affects the format of the returned beta.
209 |     y: pd.Series | np.ndarray
210 |         The dependent variable.
211 |     w: pd.Series | np.ndarray | None, optional
212 |         The weights for WLS, default is None.
213 |     fit_intercept: bool, optional
214 |         Whether to fit an intercept term, default is True.
215 |
216 |     Returns
217 |     -------
218 |     RegressionResult
219 |         The regression result object.
220 |     """
221 |     if isinstance(x, (tuple, list)):
222 |         x = pd.concat(x, axis=1)
223 |
224 |     if isinstance(x, pd.Series):
225 |         x = x.to_frame()
226 |
227 |     if isinstance(x, np.ndarray):
228 |         if x.ndim == 1:
229 |             x = x.reshape(-1, 1)
230 |         if x.ndim != 2:
231 |             raise ValueError("x must be 1d or 2d array")
232 |
233 |     univariate = x.shape[1] == 1
234 |
235 |     result = _regression(x, y, w=w, fit_intercept=fit_intercept)
236 |     return RegressionResult(result, fit_intercept=fit_intercept, univariate=univariate)
237 |
238 |
239 | @delayed
240 | def calculate_window(x_wind, y_wind, w_wind, m1, m2, m3, m, fit_intercept, univariate, cov_type, cov_kwds):
241 |     alphas: list[float] = []
242 |     betas: list[float | np.ndarray | None] = []
243 |     for j in range(m):
244 |         x_j = x_wind[:, :, min(j, m1 - 1)].T
245 |         y_j = y_wind[:, min(j, m2 - 1)]
246 |         w_j = None if w_wind is None else w_wind[:, min(j, m3 - 1)]
247 |         # if any x is all nan, skip regression
248 |         if np.isnan(x_j).all(axis=0).any() or np.isnan(y_j).all():
249 |             # alpha is always a float; beta may be an array or a float, so use None to mark an empty result
250 |             alpha = np.nan
251 |             beta = None
252 |         else:
253 |             res = RegressionResult(
254 |                 # fit_intercept is always False, because we've padded X in __init__
255 |                 _regression(x_j, y_j, w_j, fit_intercept=False, cov_type=cov_type, cov_kwds=cov_kwds),
256 |                 fit_intercept=fit_intercept,
257 |                 univariate=univariate,
258 |             )
259 |             alpha = res.alpha
260 |             beta = res.beta
261 |         alphas.append(alpha)
262 |         betas.append(beta)
263 |
264 |     return alphas, betas
265 |
266 |
267 | class RollingRegressor:
268 |     """
269 |     Perform rolling regression.
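    For every end point t, the model is refit on the trailing `window` observations, producing time-varying alpha and beta panels.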
270 |
271 |     Parameters
272 |     ----------
273 |     x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
274 |         The independent variable(s).
275 |     y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray
276 |         The dependent variable.
277 |     w: optional
278 |         The weights for WLS, default is None.
279 |     mode: typing.Literal["single", "multi"], optional
280 |         The mode of regression, default is None.
281 |     fit_intercept: bool, optional
282 |         Whether to fit an intercept term, default is True.
283 |     """
284 |
285 |     def __init__(
286 |         self,
287 |         x,
288 |         y,
289 |         w=None,
290 |         *,
291 |         mode: typing.Literal["single", "multi"] = None,
292 |         fit_intercept: bool = True,
293 |     ):
294 |         # We generally don't check the alignment of the inputs. It's the user's obligation to make sure the inputs are
295 |         # compatible in terms of shape and aligned with each other.
296 |         self._keys = {}
297 |         self._index = {}
298 |         self._columns = {}
299 |         self.x = self._parse_data(x, "x")
300 |         self.y = self._parse_data(y, "y")
301 |         self.w = self._parse_data(w, "w", allow_none=True)
302 |
303 |         # "multi": x is 3d array
304 |         # "single": x is 2d array
305 |         if isinstance(self.x, np.ndarray):
306 |             if self.x.ndim == 2:
307 |                 self.inferred_mode = "single"
308 |                 self.x = self.x.reshape(1, *self.x.shape)
309 |             elif self.x.ndim == 3:
310 |                 self.inferred_mode = "multi"
311 |             else:
312 |                 raise ValueError("x must be 2d or 3d array")
313 |         else:
314 |             raise ValueError("parsed x should be array")
315 |
316 |         # now x is 3d array: key-index-columns
317 |         if fit_intercept:
318 |             self.x = np.concatenate([np.ones((1, *self.x.shape[1:])), self.x])
319 |
320 |         if mode is not None and mode != self.inferred_mode:
321 |             raise ValueError(f"inferred mode ({self.inferred_mode}) is not equal to the specified mode ({mode})")
322 |
323 |         self.keys = None if not self._keys else next(iter(self._keys.values()))
324 |         self.index = None if not self._index else next(iter(self._index.values()))
325 |
326 |         if not self._columns:
327 |             self.columns = None
328 |         else:
329 |             len_col = list(map(len, self._columns.values()))
330 |             max_len_loc = len_col.index(max(len_col))
331 |             self.columns = list(self._columns.values())[max_len_loc]
332 |
333 |         self.fit_intercept = fit_intercept
334 |
335 |     @property
336 |     def is_univariate(self):
337 |         """
338 |         Check if the regression is univariate.
339 |
340 |         Returns
341 |         -------
342 |         bool
343 |             True if univariate, False otherwise.
344 |         """
345 |         if self.inferred_mode == "single":
346 |             return True
347 |         else:
348 |             assert self.inferred_mode == "multi"
349 |             return False
350 |
351 |     def _parse_data(self, a, data_name: typing.Literal["x", "y", "w"], allow_none=False):
352 |         """
353 |         Parse the input data.
354 |
355 |         Parameters
356 |         ----------
357 |         a
358 |             The input data.
359 |         data_name: typing.Literal["x", "y", "w"]
360 |             The name of the data.
361 |         allow_none: bool, optional
362 |             Whether to allow None as input, default is False.
363 |
364 |         Returns
365 |         -------
366 |         np.ndarray
367 |             The parsed data.
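            Series are promoted to single-column frames, and DataFrame indexes/columns are recorded so that results can later be re-wrapped as pandas objects.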
368 | """ 369 | if a is None: 370 | if allow_none: 371 | return 372 | else: 373 | raise ValueError(f"input {data_name} cannot be None") 374 | if isinstance(a, pd.Series): 375 | a = a.to_frame() 376 | if isinstance(a, pd.DataFrame): 377 | self._index[data_name] = a.index 378 | self._columns[data_name] = a.columns 379 | return a.values 380 | elif isinstance(a, np.ndarray): 381 | if a.ndim == 1: 382 | a = a.reshape(-1, 1) 383 | if a.ndim not in (2, 3): 384 | raise ValueError(f"input {data_name} should be 2-d or 3-d if it's array") 385 | return a 386 | else: 387 | if data_name in ("x", "w"): 388 | if isinstance(a, dict): 389 | self._keys[data_name] = list(a.keys()) 390 | a = list(a.values()) 391 | 392 | if isinstance(a, (list, tuple)): 393 | if len(set([i.shape for i in a])) != 1: 394 | raise ValueError(f"input {data_name} should have same shape") 395 | if not all(i.ndim == 2 for i in a): 396 | raise ValueError(f"input contents of {data_name} should be 2-d, if it's list") 397 | if isinstance(a[0], pd.DataFrame): 398 | self._index[data_name] = a[0].index 399 | self._columns[data_name] = a[0].columns 400 | a = np.array(a) 401 | return a 402 | else: 403 | raise TypeError(f"input {data_name} should be array-like or list") 404 | 405 | else: 406 | raise ValueError(f"input {data_name}'s type not supported") 407 | 408 | @classmethod 409 | def _transpose_or_none(cls, _x): 410 | """ 411 | Transpose the array if it is not None. 412 | 413 | Parameters 414 | ---------- 415 | _x 416 | The input array. 417 | 418 | Returns 419 | ------- 420 | np.ndarray or None 421 | The transposed array or None. 422 | """ 423 | # the last 2 axes are always time x stocks 424 | if _x is not None: 425 | return np.swapaxes(_x, -1, -2) 426 | 427 | def fit( 428 | self, 429 | window: int | None = None, 430 | axis=0, 431 | cov_type: str | None = None, 432 | cov_kwds: dict | None = None, 433 | n_jobs: int = 4, 434 | verbose: int = 0, 435 | ): 436 | """ 437 | Fit the rolling regression model. 438 | 439 | Parameters 440 | ---------- 441 | window: int | None, optional 442 | The window size for rolling regression, default is None. If None, window = len(data) 443 | axis: int, optional 444 | The axis along which to perform the regression, default is 0. 445 | cov_type: str | None, optional 446 | The covariance estimator, default is None. 447 | - If None: use the default homoskedastic standard errors. 448 | - If "HAC": Newey–West heteroskedasticity-and-autocorrelation robust SE. 449 | - Other options supported by statsmodels (e.g. "HC0", "HC1", …). 450 | cov_kwds: dict | None, optional 451 | The keyword arguments for the covariance estimator, default is None. 452 | For Newey–West, you’d typically pass `{"maxlags": L}` to control lag length. 453 | n_jobs: int 454 | num of parallel workers, passed to Parallel 455 | verbose: int 456 | verbosity of progress, passed to Parallel 457 | 458 | Returns 459 | ------- 460 | BatchRegressionResult 461 | The batch regression result object. 
462 | """ 463 | x = self.x 464 | y = self.y 465 | w = self.w 466 | 467 | keys = self.keys 468 | index = self.index 469 | columns = self.columns 470 | 471 | fit_intercept = self.fit_intercept 472 | univariate = self.inferred_mode == "single" 473 | transpose = axis != 0 474 | 475 | if transpose: 476 | x = self._transpose_or_none(x) 477 | y = self._transpose_or_none(y) 478 | w = self._transpose_or_none(w) 479 | 480 | # generic shape compat 481 | k, n1, m1 = x.shape 482 | n2, m2 = y.shape 483 | if m1 != m2 and min(m1, m2) != 1: 484 | raise ValueError(f"incompatible x, y shapes: {x.shape} vs {y.shape}") 485 | if n1 != n2: 486 | raise ValueError(f"x, y should have same length") 487 | 488 | n = n1 489 | m = max(m1, m2) 490 | m3 = 1 491 | 492 | if w is not None: 493 | n3, m3 = w.shape 494 | if m3 > 1 and m3 != m: 495 | raise ValueError(f"incompatible x, y, w shapes: {x.shape} vs {y.shape} vs {w.shape}") 496 | if n3 != n: 497 | raise ValueError(f"x, w should have same length") 498 | 499 | # window not specified, use total length as window 500 | # in this case, result should also be pruned 501 | is_table = window is None 502 | if is_table: 503 | window = n 504 | 505 | alpha = None 506 | if fit_intercept: 507 | alpha = np.full((n, m), np.nan) 508 | beta = np.full((k - fit_intercept, n, m), np.nan) 509 | 510 | result_gen = Parallel(n_jobs=n_jobs, verbose=verbose, return_as="generator")( 511 | calculate_window( 512 | x_wind=x[:, i : i + window], 513 | y_wind=y[i : i + window], 514 | w_wind=None if w is None else w[i : i + window], 515 | m1=m1, 516 | m2=m2, 517 | m3=m3, 518 | m=m, 519 | fit_intercept=fit_intercept, 520 | univariate=univariate, 521 | cov_type=cov_type, 522 | cov_kwds=cov_kwds, 523 | ) 524 | for i in range(n - window + 1) 525 | ) 526 | for i, (alphas, betas) in enumerate(result_gen): 527 | alpha[i + window - 1] = alphas 528 | for j, _beta in enumerate(betas): 529 | if _beta is not None: 530 | beta[:, i + window - 1, j] = _beta 531 | 532 | # squeeze if table 533 | if is_table: 534 | # columns 535 | alpha = alpha[-1] 536 | # keys x columns 537 | beta = beta[:, -1] 538 | # maybe transpose back 539 | if transpose: 540 | beta = self._transpose_or_none(beta) 541 | # wrap dataframe if possible 542 | if is_table: 543 | alpha = pd.Series(alpha, index=index if transpose else columns, name="alpha") 544 | if transpose: 545 | # axis = 1 546 | beta = pd.DataFrame(beta, index=index, columns=keys) 547 | else: 548 | beta = pd.DataFrame(beta, index=keys, columns=columns) 549 | if self.is_univariate: 550 | beta = beta.squeeze(axis=axis) 551 | else: 552 | alpha = pd.DataFrame(alpha, index=index, columns=columns) 553 | if self.is_univariate: 554 | beta = pd.DataFrame(np.squeeze(beta, axis=0), index=index, columns=columns) 555 | else: 556 | beta = [pd.DataFrame(beta[i], index=index, columns=columns) for i in range(k - fit_intercept)] 557 | if keys is not None: 558 | for _key, _beta in zip(keys, beta): 559 | _beta.name = _key 560 | return BatchRegressionResult(beta, alpha=alpha) 561 | 562 | 563 | def rolling_regression(x, y, window, w=None, *, fit_intercept=True): 564 | """ 565 | Perform rolling regression. 566 | 567 | Parameters 568 | ---------- 569 | x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 570 | The independent variable(s). 571 | y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 572 | The dependent variable. 573 | window: int 574 | The window size for rolling regression. 575 | w: optional 576 | The weights for WLS, default is None. 
577 | fit_intercept: bool, optional 578 | Whether to fit an intercept term, default is True. 579 | 580 | Returns 581 | ------- 582 | BatchRegressionResult 583 | The batch regression result object. 584 | """ 585 | return RollingRegressor(x, y, w, fit_intercept=fit_intercept).fit(window) 586 | 587 | 588 | def table_regression(x, y, w=None, *, fit_intercept=True, axis=1): 589 | """ 590 | Perform table regression (apply regression column-wise or row-wise) 591 | 592 | Parameters 593 | ---------- 594 | x: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 595 | The independent variable(s). 596 | y: pd.Series | pd.DataFrame | list[pd.Series] | np.ndarray 597 | The dependent variable. 598 | w: optional 599 | The weights for WLS, default is None. 600 | fit_intercept: bool, optional 601 | Whether to fit an intercept term, default is True. 602 | axis: int, optional 603 | The axis along which to perform the regression, default is 1. 604 | 605 | Returns 606 | ------- 607 | BatchRegressionResult 608 | The batch regression result object. 609 | """ 610 | return RollingRegressor(x, y, w, fit_intercept=fit_intercept).fit(None, axis=axis) 611 | -------------------------------------------------------------------------------- /firefin/core/plot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/core/plot/__init__.py -------------------------------------------------------------------------------- /firefin/core/plot/plots.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | from pathlib import Path 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | import statsmodels.api as sm 11 | from matplotlib.gridspec import GridSpec 12 | from numba import njit 13 | from scipy import stats 14 | 15 | from ...evaluation.eva_utils import PeriodType, IC, QuantileReturns 16 | 17 | __all__ = [ 18 | "plt_ic", 19 | "plt_cumulative_returns", 20 | "plt_quantile_cumulative_returns", 21 | "plt_quantile_cumulated_end_returns", 22 | ] 23 | 24 | 25 | sns.set_style("whitegrid") 26 | 27 | 28 | def _plt_cumsum_ic(summarized_data, ax, factor_name, data_name): 29 | y_mean = summarized_data.resample("YE", label="left").mean() 30 | for c, sr in y_mean.T.iterrows(): 31 | ax.scatter(sr.index, sr, marker="x") 32 | ax.set_title(f"{factor_name} Cumulative {data_name} & Yearly Mean") 33 | ax.axhline(0, linestyle="-", color="black", lw=1) 34 | 35 | axr = ax.twinx() 36 | axr.plot(summarized_data.cumsum()) 37 | axr.legend(summarized_data.columns, loc=2) 38 | axr.grid(linestyle=":") 39 | 40 | # make sure the left axis has visible 0-line 41 | b, t = ax.get_ylim() 42 | if b > 0: 43 | ax.set_ylim(0, t) 44 | elif t < 0: 45 | ax.set_ylim(b, 0) 46 | 47 | 48 | def _summarize_ic_data(data): 49 | origin_columns = data.columns 50 | 51 | _mean = data.mean() 52 | _std = data.std() 53 | _ir = data.mean() / data.std() 54 | 55 | summary_columns = [f"{c}, AVG={_mean[c]:.2%}, STD={_std[c]:.2%}, IR={_ir[c]:.2f}" for c in origin_columns] 56 | 57 | summarized_data = data.rename(columns=dict(zip(origin_columns, summary_columns))) 58 | 59 | return summarized_data 60 | 61 | 62 | def _plt_monthly_and_20ma_ic(data, axs, data_name, color_bounds): 63 | origin_columns 
= data.columns
64 |     markersize = 5
65 |
66 |     data_month = data.resample("ME").mean()
67 |     summarized_data = _summarize_ic_data(data)
68 |
69 |     for i_p, col in enumerate(origin_columns):
70 |         ax = axs[i_p]
71 |
72 |         data_col = summarized_data.iloc[:, i_p]
73 |         ax.plot(data_col.rolling(20, min_periods=1).mean())
74 |
75 |         month_p_data = data_month.iloc[:, i_p]
76 |
77 |         for color, bounds in color_bounds.items():
78 |             data_sel = month_p_data[(month_p_data >= bounds[0]) & (month_p_data <= bounds[1])]
79 |             data_sel.plot(color=color, linestyle="", marker="D", markersize=markersize, ax=ax)
80 |
81 |         ax.set(
82 |             xlabel="",
83 |             title=f"{col} {data_name}, Monthly Average and 20-day MA",
84 |         )
85 |
86 |     return summarized_data
87 |
88 |
89 | def plt_ic(ic_data: IC, factor_name="Factor", dist=True, plot_dir=None, show=True):
90 |     """
91 |     Plot an IC plot with Monthly IC, Cumulative IC and IC distribution.
92 |
93 |     Parameters
94 |     ----------
95 |     ic_data: IC
96 |     factor_name: str
97 |     plot_dir: None or Path
98 |     show: bool
99 |     dist: bool
100 |         if True, show distribution of IC and its QQ-plot.
101 |
102 |     """
103 |
104 |     ic_data = ic_data.dropna(how="all")
105 |     if not isinstance(ic_data.index, pd.DatetimeIndex):
106 |         ic_data.index = pd.DatetimeIndex(ic_data.index)
107 |     columns = ic_data.columns
108 |     n_cols = len(columns)
109 |
110 |     if dist:
111 |         fig_width = 20
112 |         grid_width = 4
113 |     else:
114 |         fig_width = 10
115 |         grid_width = 2
116 |
117 |     fig = plt.figure(figsize=(fig_width, 3.5 * n_cols))
118 |     grid = GridSpec(n_cols * 4, grid_width, figure=fig)
119 |
120 |     # line chart: one panel per period, 3 grid rows tall
121 |     ax0 = fig.add_subplot(grid[:3, :2])
122 |     axs = [ax0]
123 |     for i_p in range(1, n_cols):
124 |         axs.append(fig.add_subplot(grid[i_p * 3 : (i_p + 1) * 3, :2], sharex=ax0, sharey=ax0))
125 |     # cumulative chart: a single panel, n_periods grid rows tall
126 |     axs.append(fig.add_subplot(grid[n_cols * 3 :, :2]))
127 |
128 |     if dist:
129 |         # distribution plots: one pair per period, 4 grid rows tall
130 |         for i_p in range(n_cols):
131 |             for i in range(2):
132 |                 axs.append(fig.add_subplot(grid[i_p * 4 : (i_p + 1) * 4, 2 + i]))
133 |
134 |     # matplotlib/_color_data.py
135 |     # https://drafts.csswg.org/css-color-4/#named-colors
136 |     # all bounds are closed intervals; draw dark colors first, then the bright colors at the extremes and grey around 0
137 |     color_bounds = {
138 |         "grey": [-0.02, 0.02],
139 |         "darkblue": [-0.05, -0.02],
140 |         "darkred": [0.02, 0.05],
141 |         "blue": [-np.inf, -0.05],
142 |         "red": [0.05, np.inf],
143 |     }
144 |
145 |     summarized_data = _plt_monthly_and_20ma_ic(ic_data, axs, "IC", color_bounds=color_bounds)
146 |     _plt_cumsum_ic(summarized_data, axs[n_cols], factor_name, "IC")
147 |
148 |     if dist:
149 |         for i_p, p in enumerate(columns):
150 |             ic_data_p = ic_data.iloc[:, i_p].dropna()
151 |
152 |             ax1, ax2 = axs[n_cols + 1 + i_p * 2], axs[n_cols + 2 + i_p * 2]
153 |
154 |             sns.histplot(ic_data_p, kde=True, bins=int(np.ceil(np.log(ic_data_p.size) * 10)), stat="density", ax=ax1)
155 |             ax1.set(
156 |                 xlabel=f"{p}, Mean {ic_data_p.mean():.2f}, Skew {ic_data_p.skew():.2f}, Kurt {ic_data_p.kurt():.2f}"
157 |             )
158 |             sm.qqplot(ic_data_p, stats.norm, fit=True, line="45", ax=ax2)
159 |             ax2.set(ylabel="Observed Quantile", xlabel="Norm Distribution Quantile")
160 |
161 |     if plot_dir:
162 |         plt.savefig(Path(plot_dir) / f"{factor_name} IC plot.png", bbox_inches="tight")
163 |     if show:
164 |         plt.show()
165 |
166 |     summary_table = pd.DataFrame(
167 |         np.nan,
168 |         index=["mean", "std", "ir", "> 0", "< 0", "> 3%", "< -3%", "> 5%", "< -5%"],
169 |         columns=columns,
170 |     )
171 |
172 |     ic_mean = ic_data.mean()
173 |     ic_std = ic_data.std()
174 |     ir = ic_mean / ic_std
175 |
176 |
summary_table.loc["mean"] = ic_mean.values 177 | summary_table.loc["std"] = ic_std.values 178 | summary_table.loc["ir"] = ir.values 179 | summary_table.loc["> 0"] = ((ic_data > 0).sum() / np.isfinite(ic_data).sum()).values 180 | summary_table.loc["< 0"] = ((ic_data < 0).sum() / np.isfinite(ic_data).sum()).values 181 | summary_table.loc["> 3%"] = ((ic_data > 0.03).sum() / np.isfinite(ic_data).sum()).values 182 | summary_table.loc["< -3%"] = ((ic_data < -0.03).sum() / np.isfinite(ic_data).sum()).values 183 | summary_table.loc["> 5%"] = ((ic_data > 0.05).sum() / np.isfinite(ic_data).sum()).values 184 | summary_table.loc["< -5%"] = ((ic_data < -0.05).sum() / np.isfinite(ic_data).sum()).values 185 | print(summary_table) 186 | 187 | 188 | def _get_annual_and_end_returns(daily_cum_returns): 189 | daily_cum_returns = np.asarray(daily_cum_returns) 190 | n, m = daily_cum_returns.shape 191 | na_mask = np.isfinite(daily_cum_returns) 192 | end_returns = [] 193 | 194 | for j in range(m): 195 | for i in range(n): 196 | if na_mask[n - 1 - i, j]: 197 | end_returns.append(daily_cum_returns[n - 1 - i, j]) 198 | break 199 | end_returns = np.asarray(end_returns) 200 | annual_returns = np.float_power(np.add(end_returns, 1), 244 / na_mask.sum(axis=0)) - 1 201 | return annual_returns, end_returns 202 | 203 | 204 | def plt_cumulative_returns( 205 | *, 206 | daily_returns=None, 207 | daily_cum_returns=None, 208 | show_min_max=True, 209 | title="Cumulative Returns", 210 | ax=None, 211 | show=False, 212 | plot_dir=None, 213 | ): 214 | """ 215 | 216 | Parameters 217 | ---------- 218 | daily_returns: pd.DataFrame 219 | daily_cum_returns: pd.DataFrame 220 | show_min_max: bool 221 | title: str 222 | ax: matplotlib axis 223 | show: bool 224 | plot_dir: Path, default=None 225 | 226 | """ 227 | 228 | if daily_returns is None: 229 | if daily_cum_returns is None: 230 | raise ValueError(f"one of daily_returns or daily_cum_returns must be provided") 231 | else: 232 | daily_cum_returns = daily_cum_returns.dropna(how="all") 233 | daily_returns = daily_cum_returns.add(1).pct_change() 234 | daily_returns.iloc[0] = daily_cum_returns.iloc[0] 235 | 236 | else: 237 | if daily_cum_returns is not None: 238 | raise ValueError(f"exactly one of daily_returns or daily_cum_returns should be provided") 239 | else: 240 | daily_returns = daily_returns.dropna(how="all") 241 | daily_cum_returns = daily_returns.add(1).cumprod() - 1 242 | 243 | # set name for columns 244 | annual_returns, end_returns = _get_annual_and_end_returns(daily_cum_returns) 245 | daily_cum_returns.columns = [ 246 | f"{c}, ANN. {art:.2%}, TOT. {ert:.2%}" 247 | for c, art, ert in zip(daily_cum_returns.columns, annual_returns, end_returns) 248 | ] 249 | daily_returns.columns = daily_cum_returns.columns 250 | 251 | if ax is None: 252 | _, ax = plt.subplots() 253 | daily_cum_returns.plot(cmap=plt.cm.coolwarm, ax=ax) 254 | if show_min_max: 255 | max_group = daily_returns.columns[[0, -1]][np.argmax(annual_returns[[0, -1]])] 256 | min_group = daily_returns.columns[[0, -1]][np.argmin(annual_returns[[0, -1]])] 257 | min_max_diff = (daily_returns.loc[:, max_group] - daily_returns.loc[:, min_group] + 1).cumprod() - 1 258 | [annual_returns], [end_returns] = _get_annual_and_end_returns(min_max_diff.to_frame()) 259 | min_max_diff.name = f"Min Max, ANN. {annual_returns:.2%}, TOT. 
{end_returns:.2%}" 260 | min_max_diff.plot(lw=2, color="black", alpha=0.8, ax=ax) 261 | 262 | ax.set(xlabel="", ylabel="Cumulative Returns", title=title) 263 | ax.legend(loc=2, ncol=int(np.ceil(len(daily_returns.columns) / 25)), fontsize=8) 264 | ax.axhline(0.0, linestyle="-", color="black", lw=1) 265 | 266 | # if logy: 267 | # from matplotlib.ticker import FuncFormatter 268 | # 269 | # log_return_locator_cls = get_log_return_locator() 270 | # 271 | # fwd, ivt = lambda x: np.log1p(x), lambda x: np.exp(x) - 1 272 | # ax.set_yscale("function", functions=(fwd, ivt)) 273 | # ax.set_ylim([np.exp(np.log(1 + np.nanmin(daily_cum_returns)) * 1.1) - 1, None]) 274 | # ax.yaxis.set_major_locator(log_return_locator_cls(base=10, linthresh=1)) 275 | # ax.yaxis.set_major_formatter(FuncFormatter(log_return_formater)) 276 | 277 | if plot_dir: 278 | plt.savefig(plot_dir / f"{title}.png", bbox_inches="tight") 279 | if show: 280 | plt.show() 281 | 282 | 283 | def return_to_daily(data: pd.Series | pd.DataFrame, period: PeriodType): 284 | """Convert period returns to daily returns.""" 285 | if period == 1: 286 | return data.copy(deep=False) 287 | return ((data + 1) ** (1 / period)) - 1 288 | 289 | 290 | def compute_cum_returns(daily_ret: pd.Series | pd.DataFrame): 291 | return (1 + daily_ret).cumprod() - 1 292 | 293 | 294 | def _can_plot_recent(data: pd.Series | pd.DataFrame, years=3) -> tuple[bool, pd.Timestamp]: 295 | """check if longer than 3 years and return the -3 year loc if possible""" 296 | # index of this is datetime index 297 | index = data.index 298 | if (index[-1] - index[0]).days // 365 >= years: 299 | plot_recent = True 300 | loc = index[-1] - pd.Timedelta(days=365 * years) 301 | else: 302 | plot_recent = False 303 | loc = None 304 | return plot_recent, loc 305 | 306 | 307 | def plt_quantile_cumulative_returns(quantile_returns: QuantileReturns, factor_name="Factor", plot_dir=None, show=True): 308 | """ 309 | Plot the cumulative returns of each quantile. 
310 | 311 | Parameters 312 | ---------- 313 | quantile_returns: QuantileReturns 314 | factor_name: str 315 | plot_dir: Path, default None 316 | show: bool, default True 317 | 318 | """ 319 | cum_returns = { 320 | period: compute_cum_returns(return_to_daily(period_returns, period)) 321 | for period, period_returns in quantile_returns.items() 322 | } 323 | periods = sorted(cum_returns.keys()) 324 | 325 | plot_recent, loc = _can_plot_recent(next(iter(quantile_returns.values()))) 326 | 327 | fig, axs = plt.subplots(len(periods), 1 + plot_recent, figsize=(10 + (3 * plot_recent), 7 * len(periods))) 328 | for (period, period_cum_returns), ax in zip(cum_returns.items(), axs): 329 | if plot_recent: 330 | ax1, ax2 = ax 331 | else: 332 | ax1, ax2 = ax, NotImplemented 333 | 334 | plt_cumulative_returns( 335 | daily_cum_returns=period_cum_returns, 336 | ax=ax1, 337 | show_min_max=True, 338 | title=f"{factor_name} ({period} Fwd Period)", 339 | show=False, 340 | ) 341 | if plot_recent: 342 | recent_data = period_cum_returns.loc[loc:] + 1 343 | recent_data = recent_data.pct_change(fill_method=None).add(1).cumprod().sub(1) 344 | plt_cumulative_returns( 345 | daily_cum_returns=recent_data, 346 | ax=ax2, 347 | show_min_max=True, 348 | title=f"{factor_name} (Recent) ({period} Fwd Period)", 349 | show=False, 350 | ) 351 | if plot_dir: 352 | plt.savefig( 353 | Path(plot_dir) / f"{factor_name} Quantile Cum Returns.png", 354 | bbox_inches="tight", 355 | ) 356 | if show: 357 | plt.show() 358 | 359 | 360 | @njit 361 | def get_cum_end_returns(daily_rt): 362 | started = False 363 | cum_rt = 1 364 | total = 0 365 | n_cs_nans = 0 366 | for x in daily_rt: 367 | if np.isfinite(x): 368 | n_cs_nans = 0 369 | started = True 370 | cum_rt *= 1 + x 371 | total += 1 372 | else: 373 | n_cs_nans += 1 374 | if started: 375 | total += 1 376 | if not started: 377 | return np.nan 378 | total -= n_cs_nans 379 | return cum_rt ** (244 / total) - 1 380 | 381 | 382 | def get_cumulated_end_returns(daily_ret: pd.Series | pd.DataFrame, std=False): 383 | """ 384 | Get cumulated end returns of each quantile 385 | 386 | Parameters 387 | ---------- 388 | std: bool, default False 389 | If True, returns the standard deviation of the cumulated end returns 390 | 391 | """ 392 | 393 | returns_avg = daily_ret.apply(get_cum_end_returns, raw=True) 394 | 395 | if std: 396 | returns_std = daily_ret.std() * np.sqrt(244) 397 | return returns_avg, returns_std 398 | else: 399 | return returns_avg 400 | 401 | 402 | def _get_avg_and_std(quantile_returns: QuantileReturns): 403 | returns_avg = {} 404 | returns_std = {} 405 | for period, period_returns in quantile_returns.items(): 406 | returns_avg[period], returns_std[period] = get_cumulated_end_returns(period_returns, std=True) 407 | 408 | # quantile x periods 409 | return pd.DataFrame(returns_avg), pd.DataFrame(returns_std) 410 | 411 | 412 | def plt_quantile_cumulated_end_returns( 413 | quantile_returns: QuantileReturns, factor_name="Factor", plot_dir=None, show=True 414 | ): 415 | """ 416 | Plot the cumulated end returns of each quantile. 
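    Notes
    -----
    Annualisation follows `get_cum_end_returns` below: daily returns are compounded
    and scaled to a 244-trading-day year (the same constant used elsewhere in this
    module), ignoring leading and trailing NaNs. As an illustrative sanity check, a
    constant daily return annualises to (1 + r) ** 244 - 1 regardless of window length:

    >>> import numpy as np
    >>> np.isclose(get_cum_end_returns(np.full(100, 0.001)), 1.001 ** 244 - 1)
    True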
417 | 418 | Parameters 419 | ---------- 420 | quantile_returns: QuantileReturns 421 | factor_name: str 422 | plot_dir: Path, default None 423 | show: bool, default True 424 | 425 | """ 426 | returns_avg, returns_std = _get_avg_and_std(quantile_returns) 427 | 428 | plot_recent, loc = _can_plot_recent(next(iter(quantile_returns.values()))) 429 | 430 | w, h = (4 * len(returns_avg) * (1 + plot_recent) + 50) / 9, 16 431 | 432 | def _plot(avg, std, axavg, axstd, name): 433 | avg.plot(kind="bar", width=0.8, ax=axavg) 434 | axavg.set( 435 | xlabel="", 436 | ylabel="Return Mean (Ann.)", 437 | title=f"{name} Return Mean By Quantile", 438 | ) 439 | std.plot(kind="bar", width=0.8, ax=axstd) 440 | axstd.set( 441 | xlabel="", 442 | ylabel="Return Std (Ann.)", 443 | title=f"{name} Return Std By Quantile", 444 | ) 445 | 446 | fig, axs = plt.subplots(2, 1 + plot_recent, figsize=(w, h)) 447 | if plot_recent: 448 | (ax_avg1, ax_avg2), (ax_std1, ax_std2) = axs 449 | else: 450 | ax_avg1, ax_std1 = axs 451 | ax_avg2 = ax_std2 = None 452 | _plot(returns_avg, returns_std, ax_avg1, ax_std1, factor_name) 453 | 454 | if plot_recent: 455 | returns_avg_rct, returns_std_rct = _get_avg_and_std( 456 | QuantileReturns({k: v.loc[loc:] for k, v in quantile_returns.items()}) 457 | ) 458 | _plot(returns_avg_rct, returns_std_rct, ax_avg2, ax_std2, f"{factor_name} (Recent)") 459 | 460 | for ax in axs.flatten(): 461 | ax.yaxis.set_major_formatter(plt.FuncFormatter("{:.0%}".format)) 462 | 463 | if plot_dir: 464 | plt.savefig( 465 | Path(plot_dir) / f"{factor_name} Quantile End Returns.png", 466 | bbox_inches="tight", 467 | ) 468 | if show: 469 | plt.show() 470 | -------------------------------------------------------------------------------- /firefin/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | """ 5 | interface for fetching data 6 | 7 | """ 8 | from .gateway import fetch_data -------------------------------------------------------------------------------- /firefin/data/datainfo.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import os 5 | import pandas as pd 6 | from ..common.config import DATA_PATH 7 | 8 | def load_AStock_info() -> tuple[pd.DataFrame, pd.DataFrame]: 9 | try: 10 | columns = pd.read_feather(os.path.join(DATA_PATH, "columns.feather")) 11 | index = pd.read_feather(os.path.join(DATA_PATH, "index.feather")) 12 | except FileNotFoundError as e: 13 | raise FileNotFoundError(f"File not found: {e}, please download data first") 14 | return columns, index -------------------------------------------------------------------------------- /firefin/data/fake.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import typing 5 | from functools import partial 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from ..common.const import MIN_BARTIMES 11 | 12 | IndexType = typing.Literal["d", "day", "m", "min", "minute", "l1", "l2"] 13 | MockType = typing.Literal["rand", "norm", "price", 
"volume", "return", "arange"] 14 | 15 | 16 | def _get_l2_seconds(): 17 | morning = pd.timedelta_range("09:15:00", "11:30:00", freq="3 s") 18 | afternoon = pd.timedelta_range("13:00:00", "15:30:00", freq="3 s") 19 | return morning.union(afternoon) 20 | 21 | 22 | l2_seconds = _get_l2_seconds() 23 | 24 | 25 | def _index_maker(n, index_type: IndexType = "day"): 26 | if index_type in ("d", "day"): 27 | return pd.date_range("2010/1/1", periods=n, name="trade_date").strftime("%Y-%m-%d") 28 | elif index_type in ("m", "min", "minute"): 29 | n_days, n_minute = divmod(n, len(MIN_BARTIMES)) 30 | if n_minute > 0: 31 | n_days += 1 32 | day_part = _index_maker(n_days, index_type="day") 33 | total_index = pd.MultiIndex.from_product([day_part, MIN_BARTIMES], names=["trade_date", "bartime"]) 34 | if n_minute == 0: 35 | return total_index 36 | else: 37 | return total_index[: -(len(MIN_BARTIMES) - n_minute)] 38 | elif index_type in ("l1", "l2"): 39 | n_days, n_sec = divmod(n, 5702) 40 | if n_sec > 0: 41 | n_days += 1 42 | day_part = _index_maker(n_days, index_type="day") 43 | 44 | total_index = pd.concat( 45 | [pd.Series(0, index=pd.DatetimeIndex([dt]).repeat(5702) + l2_seconds) for dt in day_part] 46 | ).index 47 | if n_sec == 0: 48 | return total_index 49 | else: 50 | return total_index[: -(5702 - n_sec)] 51 | raise NotImplementedError(f"index_type {index_type} not implemented") 52 | 53 | 54 | def _nb_random(shape, mock): 55 | if mock == "rand": 56 | return np.random.random(shape) 57 | elif mock == "norm": 58 | return np.random.randn(*shape) 59 | elif mock == "return": 60 | return np.random.normal(0.0, 0.03, shape) 61 | elif mock == "price": 62 | rt = _nb_random(shape, mock="return") 63 | price = (rt + 1).cumprod().reshape((shape[0], -1)) 64 | price *= np.exp(np.random.normal(3.5, 1.06, price.shape[-1])) 65 | return price.reshape(shape) 66 | elif mock == "volume": 67 | return np.exp(np.random.normal(14.26, 1.29, shape)) 68 | elif mock == "arange": 69 | total = 1 70 | for s in shape: 71 | total *= s 72 | return np.arange(total, dtype=np.float64).reshape(shape) 73 | 74 | 75 | def _value_maker(shape, fill_value=np.nan, mock: MockType = "rand"): 76 | if fill_value is np.nan: 77 | if mock in MockType.__args__: 78 | return _nb_random(shape, mock) 79 | else: 80 | raise ValueError(f"mock {mock} not implemented") 81 | else: 82 | return np.full(shape, fill_value) 83 | 84 | 85 | def _generate_stock_code(i): 86 | c = f"{i:06}." 
87 | if not c.startswith(("0", "3", "6")): 88 | c = ("0", "3", "6")[int(c[0]) % 3] + c[1:] 89 | 90 | if c.startswith(("0", "3")): 91 | c += "SZ" 92 | else: 93 | c += "SH" 94 | return c 95 | 96 | 97 | def _columns_maker(n): 98 | return pd.Index(sorted(map(_generate_stock_code, range(n))), name="stock_code") 99 | 100 | 101 | def gen_df(*shape, fill_value=np.nan, index: IndexType = "day", mock: MockType = "rand", **joblib_kwargs): 102 | """quickly generate stock like DataFrames for test""" 103 | if not shape: 104 | shape = (10, 3) 105 | shape = tuple(np.ravel(shape)) 106 | 107 | index_maker = partial(_index_maker, index_type=index) 108 | value_maker = partial(_value_maker, fill_value=fill_value, mock=mock) 109 | 110 | if len(shape) == 1: 111 | container = pd.Series 112 | idx_col = {"index": index_maker(shape[0])} 113 | elif len(shape) == 2: 114 | container = pd.DataFrame 115 | idx_col = {"index": index_maker(shape[0]), "columns": _columns_maker(shape[1])} 116 | else: 117 | raise NotImplementedError(f"shape {shape} not implemented") 118 | 119 | out = container(value_maker(shape), **idx_col) 120 | return out 121 | -------------------------------------------------------------------------------- /firefin/data/file_reader.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import os 5 | import pandas as pd 6 | from ..common.config import DATA_PATH 7 | 8 | data_path = os.path.join(os.path.dirname(__file__), 'raw') 9 | 10 | # TODO: support other file types 11 | # TODO: support start and end date, only read the data in the range 12 | def read_feather(names): 13 | try: 14 | result = {n : pd.read_feather(f"{DATA_PATH}/{n}.feather") for n in names} 15 | except FileNotFoundError as e: 16 | raise FileNotFoundError(f"File not found: {e}, please download data first") 17 | return result 18 | 19 | 20 | def file_reader(info: dict[str, list[str]]) -> dict[str, pd.DataFrame]: 21 | # TODO: support other file types 22 | feather_reader_names = info['feather'] 23 | return read_feather(feather_reader_names) -------------------------------------------------------------------------------- /firefin/data/gateway.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from ..common.config import logger, DATA_MAPS 7 | from .datainfo import load_AStock_info 8 | from .fake import gen_df 9 | from .file_reader import file_reader 10 | 11 | def _get_clean_names(names) -> list: 12 | output = [] 13 | 14 | def _add_from_str(s): 15 | for n in s.split(","): 16 | n = n.replace(" ", "") 17 | if n and n not in output: 18 | output.append(n) 19 | 20 | for name in np.ravel(names): 21 | if isinstance(name, str): 22 | _add_from_str(name) 23 | else: 24 | # we assume it's iterable of strings 25 | for _name in name: 26 | _add_from_str(_name) 27 | return output 28 | 29 | 30 | def _get_clean_se(start=None, end=None, dates=None): 31 | """basic checks, no transformation for input ts""" 32 | if dates is None: 33 | if start is not None: 34 | assert np.ndim(start) == 0, f"start must be a scalar, got {start}" 35 | if end is not None: 36 | assert np.ndim(end) == 0, f"end must be a scalar, got {end}" 37 | 38 
| else: 39 | if start is not None or end is not None: 40 | raise ValueError("start and end cannot be used with dates") 41 | else: 42 | if isinstance(dates, slice): 43 | dates = [dates.start, dates.stop] 44 | elif pd.api.types.is_list_like(dates): 45 | pass 46 | else: 47 | # str, datetime-like 48 | dates = np.atleast_1d(dates) 49 | start, end = dates[0], dates[-1] 50 | 51 | return start, end 52 | 53 | 54 | def _parse_args(names, start_date, end_date, dates): 55 | """ 56 | parse names, start_date, end_date 57 | 58 | Notes 59 | ----- 60 | define a name string as either a single data name or several data names separated by commas. 61 | `names` can be: a single name string, an iterable of name strings, or an iterable containing name strings 62 | and trailing datetime-like 63 | if `names` has trailing datetime-like, `start_date`, `end_date` and `dates` should be None 64 | 65 | Examples 66 | -------- 67 | names can be: 68 | "close" 69 | "close, open" 70 | ["close", "open"] 71 | ["close, open"] 72 | trailing datetime-like can be: 73 | "2020/1/1" 74 | ["2020/1/1", "2020/1/2"] 75 | slice("2020/1/1", "2020/1/2") 76 | 77 | """ 78 | 79 | def is_datetime_like(obj): 80 | try: 81 | pd.to_datetime(obj) 82 | except Exception: 83 | return False 84 | else: 85 | return True 86 | 87 | if dates is None: 88 | # datetime is list-like or slice 89 | if isinstance(n1 := names[-1], slice): 90 | names = names[:-1] 91 | start_date, end_date = _get_clean_se(start_date, end_date, dates=n1) 92 | elif pd.api.types.is_list_like(n1) and is_datetime_like(t := np.ravel(n1)): 93 | names = names[:-1] 94 | start_date, end_date = _get_clean_se(start_date, end_date, dates=t) 95 | else: 96 | if is_datetime_like(n1): 97 | end_date = n1 98 | names = names[:-1] 99 | if len(names) >= 2 and is_datetime_like(n2 := names[-1]): 100 | start_date = n2 101 | names = names[:-1] 102 | else: 103 | start_date = end_date 104 | start_date, end_date = _get_clean_se(start_date, end_date) 105 | else: 106 | start_date, end_date = _get_clean_se(start_date, end_date, dates=dates) 107 | 108 | return _get_clean_names(names), start_date, end_date 109 | 110 | 111 | def check_if_valid(names: list[str]) -> dict[str, bool]: 112 | return {n: n in DATA_MAPS.keys() for n in names} 113 | 114 | 115 | def fetch_data( 116 | *args, 117 | names=None, 118 | start_date=None, 119 | end_date=None, 120 | dates=None, 121 | market_range="ALL", 122 | ) -> dict[str, pd.DataFrame]: 123 | if names is None: 124 | names = args 125 | elif args: 126 | raise ValueError("you may only use `names` or `*args` to specify the data to be queried") 127 | 128 | names, start_date, end_date = _parse_args(names, start_date, end_date, dates) 129 | 130 | results = {} 131 | if not names: 132 | return results 133 | 134 | valid = check_if_valid(names) 135 | 136 | for k, v in valid.items(): 137 | if not v: 138 | columns, index = load_AStock_info() 139 | logger.warning(f"{k} is not a valid data name, mock with random data") 140 | results[k] = gen_df((len(index), len(columns))) 141 | names.remove(k) 142 | 143 | if len(names) == 0: 144 | return results 145 | 146 | # only support file reader for now 147 | file_reader_names = dict() 148 | 149 | for name in names: 150 | try: 151 | l, r = DATA_MAPS[name].split("::") # noqa: E741 152 | except Exception as e: 153 | logger.error(f"cannot find data source for {name}, reason: {e}") 154 | continue 155 | 156 | if l == "file": 157 | if r not in file_reader_names: 158 | file_reader_names[r] = [name] 159 | else: 160 | file_reader_names[r].append(name) 161 | else: 162 | raise 
ValueError(f"{name} unsupported data source: {l}::{r}") 163 | 164 | # only support file reader for now 165 | results.update(file_reader(file_reader_names)) 166 | return results 167 | -------------------------------------------------------------------------------- /firefin/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/evaluation/__init__.py -------------------------------------------------------------------------------- /firefin/evaluation/academia/AcaEvaluatorModel.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import pandas as pd 3 | from ..eva_utils import compute_ic, ForwardReturns, QuantileReturns 4 | from ...core.algorithm.regression import least_square, rolling_regression, BatchRegressionResult 5 | from ...common.config import logger 6 | from .anomaly_test import AnomalyTest 7 | from .fama_macbeth import FamaMacBeth 8 | from .portfolio_sort import PortfolioSort 9 | 10 | class AcaEvaluatorModel: 11 | def __init__(self, 12 | factor: pd.DataFrame, 13 | forward_returns: ForwardReturns, 14 | return_adj: pd.DataFrame, 15 | n_jobs: int = 10, 16 | verbose: int = 0): 17 | """ 18 | Parameters: 19 | factor: pd.DataFrame 20 | Factor exposure data (Time × Stock) 21 | forward_returns: dict[str, pd.DataFrame] 22 | A dictionary where each key is a holding period, and the value is a DataFrame of future returns (Time × Stock) 23 | return_adj: pd.DataFrame 24 | DataFrame of adjusted returns (Time × Stock) 25 | n_jobs: int 26 | Number of jobs to run in parallel 27 | verbose: int 28 | Verbosity level 29 | """ 30 | 31 | self.factor = factor 32 | self.forward_returns = forward_returns 33 | self.return_adj = return_adj 34 | self.n_jobs = n_jobs 35 | self.verbose = verbose 36 | 37 | def run_single_sort(self, 38 | quantiles: int = 5, 39 | value_weighted: bool = True, 40 | return_stats: bool = False, 41 | market_cap: pd.DataFrame = None, 42 | get_quantile_sorts: bool = False): 43 | """ 44 | Perform single-factor portfolio sorting to compute returns for each quantile group, 45 | with optional return of statistics and quantile labels. 46 | 47 | Parameters: 48 | quantiles: int 49 | Number of quantile groups (e.g., 5 for quintile sorting) 50 | value_weighted: bool 51 | Whether to use value-weighted portfolios; False indicates equal-weighted portfolios 52 | return_stats: bool 53 | Whether to compute and return statistics (mean, t-stat, p-value, etc.) 
for the H-L portfolio 54 | market_cap: pd.DataFrame 55 | Market capitalization data, with the same dimensions as the factor; required if value_weighted is True 56 | get_quantile_sorts: bool 57 | Whether to return the quantile label assigned to each stock 58 | 59 | Returns: 60 | If return_stats is True: 61 | Tuple[QuantileReturns, dict] → (portfolio returns, dictionary of statistics) 62 | Otherwise: 63 | QuantileReturns 64 | """ 65 | 66 | if value_weighted and market_cap is None: 67 | raise ValueError("You must provide market_cap when value_weighted=True.") 68 | 69 | portfolio_returns = PortfolioSort.single_sort( 70 | factor=self.factor, 71 | forward_returns=self.forward_returns, 72 | market_cap=market_cap, 73 | quantiles=quantiles, 74 | value_weighted=value_weighted, 75 | get_quantile_sorts=get_quantile_sorts 76 | ) 77 | 78 | if return_stats: 79 | stats = PortfolioSort.get_statistics(portfolio_returns, quantiles) 80 | return portfolio_returns, stats 81 | 82 | return portfolio_returns 83 | 84 | def run_fama_macbeth(self, 85 | window: int = 252, 86 | return_stats: bool = False): 87 | """ 88 | Perform Fama-MacBeth two-stage cross-sectional regression estimation. 89 | 90 | Parameters: 91 | window: int 92 | Rolling window size for the first-stage regressions (default is 252, i.e., one year) 93 | return_stats: bool 94 | Whether to return t-statistics and significance test results 95 | 96 | Returns: 97 | If return_stats is True: 98 | Tuple[RegressionResult, dict] → (regression results, statistics) 99 | Otherwise: 100 | RegressionResult 101 | """ 102 | 103 | results = FamaMacBeth.run_regression(self.factor, self.return_adj, window=window, n_jobs=self.n_jobs, verbose=self.verbose) 104 | if return_stats: 105 | stats = FamaMacBeth.test_statistics(results) 106 | return results, stats 107 | return results 108 | 109 | def run_ic(self, method: str = "pearson") -> pd.DataFrame: 110 | """ 111 | Compute the Information Coefficient (IC) between the factor and future returns. 112 | 113 | Parameters: 114 | method: str 115 | Correlation method to use; options are: 'pearson', 'spearman', 'kendall' 116 | 117 | Returns: 118 | pd.DataFrame 119 | IC values for each period 120 | """ 121 | 122 | return compute_ic(self.factor, self.forward_returns, method=method) 123 | 124 | def run_regression(self, rolling: bool = False, window: int = 60, fit_intercept: bool = True) -> BatchRegressionResult | dict: 125 | """ 126 | Run either static or rolling regression of returns on factor exposures. 127 | 128 | Parameters 129 | ---------- 130 | rolling : bool, optional 131 | Whether to perform rolling regression, by default False. 132 | window : int, optional 133 | Rolling window size (only used if rolling=True), by default 60. 134 | fit_intercept : bool, optional 135 | Whether to include an intercept in the regression, by default True. 136 | 137 | Returns 138 | ------- 139 | BatchRegressionResult | dict 140 | Regression result object (static) or a dictionary of rolling results. 
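        Examples
        --------
        A minimal sketch on fake data (illustrative only; `gen_df` is the package's
        fake-data helper, everything else follows the signatures documented above):

        >>> from firefin.data.fake import gen_df
        >>> factor = gen_df(300, 20, mock="norm")
        >>> ret = gen_df(300, 20, mock="return")
        >>> model = AcaEvaluatorModel(factor, {1: ret}, ret)
        >>> rolling_res = model.run_regression(rolling=True, window=60)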
141 | """ 142 | if rolling: 143 | # Use rolling_regression function 144 | result = rolling_regression(x=self.factor, y=self.return_adj, window=window, fit_intercept=fit_intercept, n_jobs=self.n_jobs, verbose=self.verbose) 145 | else: 146 | # Time-by-time regression using least_square 147 | from collections import defaultdict 148 | results = defaultdict(list) 149 | for t in self.factor.index: 150 | x_t = self.factor.loc[t] 151 | y_t = self.return_adj.loc[t] 152 | if x_t.isnull().any() or y_t.isnull().any(): 153 | continue 154 | reg_result = least_square(x=x_t, y=y_t, fit_intercept=fit_intercept) 155 | results['alpha'].append(reg_result.alpha) 156 | results['beta'].append(reg_result.beta) 157 | results['r2'].append(reg_result.r2) 158 | results['r2_adj'].append(reg_result.r2_adj) 159 | results['residuals'].append(reg_result.residuals) 160 | result = BatchRegressionResult(alpha=results['alpha'], beta=results['beta'], r2=results['r2'], r2_adj=results['r2_adj'], residuals=results['residuals']) 161 | return result 162 | 163 | def run_anomaly_test(self, 164 | portfolio_returns: QuantileReturns, 165 | cov_type: typing.Optional[str] = None, 166 | cov_kwds: typing.Optional[dict] = None, 167 | return_stats: bool = False): 168 | """ 169 | Perform anomaly test by regressing portfolio returns on a factor model. 170 | 171 | Parameters: 172 | return_stats : bool 173 | Whether to return regression statistics summary. 174 | 175 | Returns: 176 | If return_stats is True: 177 | Tuple[AnomalyTest, pd.DataFrame] 178 | Else: 179 | AnomalyTest 180 | """ 181 | mkt_ret = pd.DataFrame(self.return_adj.mean(axis=1)) 182 | tester = AnomalyTest(portfolio_returns= portfolio_returns, factor_model=mkt_ret) 183 | 184 | if return_stats: 185 | summary = tester.fit(cov_type=cov_type, cov_kwds=cov_kwds).test_statistics() 186 | return summary 187 | return tester 188 | 189 | 190 | def run_all(self) -> dict: 191 | """ 192 | Run all available evaluation methods and return results in a dictionary. 193 | 194 | Returns 195 | ------- 196 | dict 197 | A dictionary containing the results of all evaluation methods. 
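            Keys, as populated below: 'single_sort_res', 'single_sort_stat',
            'fama_macbeth_res', 'fama_macbeth_stat', 'information_coefficient',
            'regression', and 'anomaly_stat' (the latter keyed by holding period).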
198 | """ 199 | results = {} 200 | # Single Sort 201 | logger.info("Running Single Sort") 202 | results['single_sort_res'], results['single_sort_stat'] = self.run_single_sort( 203 | quantiles=5, 204 | value_weighted=False, 205 | return_stats=True 206 | ) 207 | logger.info("Single Sort Completed") 208 | # Fama-MacBeth Regression 209 | logger.info("Running Fama-MacBeth Regression") 210 | results['fama_macbeth_res'], results['fama_macbeth_stat'] = self.run_fama_macbeth( 211 | window=252, 212 | return_stats=True 213 | ) 214 | logger.info("Fama-MacBeth Regression Completed") 215 | # IC 216 | logger.info("Running IC") 217 | results['information_coefficient'] = self.run_ic(method="pearson") 218 | logger.info("IC Completed") 219 | 220 | # Static Regression 221 | logger.info("Running Static Regression") 222 | results['regression'] = self.run_regression(rolling=False, fit_intercept=True) 223 | logger.info("Static Regression Completed") 224 | 225 | # Anomaly Test (accumulate one summary per holding period, keyed by period) 226 | logger.info("Running Anomaly Test") 227 | results['anomaly_stat'] = {} 228 | for k, v in results['single_sort_res'].items(): 229 | results['anomaly_stat'][k] = self.run_anomaly_test(portfolio_returns=pd.DataFrame(v.iloc[:, -1]), return_stats=True) 230 | logger.info("Anomaly Test Completed") 231 | 232 | return results -------------------------------------------------------------------------------- /firefin/evaluation/academia/AcaEvaluatorModelComparison.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from .portfolio_sort import PortfolioSort 3 | from .MSR_Test import MSRTest 4 | from ..eva_utils import ForwardReturns 5 | 6 | class AcaEvaluatorModelComparison: 7 | def __init__(self, factor1: pd.DataFrame, factor2: pd.DataFrame, forward_returns: ForwardReturns): 8 | """ 9 | Parameters: 10 | factor1 & factor2: pd.DataFrame 11 | Factor exposure data (Time × Stock) 12 | forward_returns: dict[str, pd.DataFrame] 13 | A dictionary where each key is a holding period, and the value is a DataFrame of future returns (Time × Stock) 14 | """ 15 | 16 | self.factor1 = factor1 17 | self.factor2 = factor2 18 | self.forward_returns = forward_returns 19 | 20 | def run_double_sort(self, 21 | quantiles: tuple = (5, 5), 22 | dependent: bool = False, 23 | value_weighted: bool = True, 24 | market_cap: pd.DataFrame = None, 25 | get_quantile_sorts: bool = False): 26 | """ 27 | Perform double-factor sorting by jointly grouping assets based on factor1 and factor2, and calculate returns. 
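        Portfolios are labelled "q1_q2" (primary group, then secondary group), and a
        hedge column 'HH-LL' (the spread between the top-top and bottom-bottom
        portfolios) is appended; see PortfolioSort.double_sort for details.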
28 | 29 | Parameters: 30 | quantiles: Tuple[int, int] 31 | Number of quantile groups for the primary and secondary factors (e.g., (5, 5)) 32 | dependent: bool 33 | Whether to use conditional (nested) sorting 34 | value_weighted: bool 35 | Whether to use value-weighted portfolios 36 | market_cap: pd.DataFrame 37 | Market capitalization data, same dimensions as the factors; required if value_weighted is True 38 | get_quantile_sorts: bool 39 | Whether to return portfolio labels (i.e., the group each stock belongs to) 40 | 41 | Returns: 42 | QuantileReturns or dict[str, pd.DataFrame] (if get_quantile_sorts is True) 43 | """ 44 | 45 | if value_weighted and market_cap is None: 46 | raise ValueError("You must provide market_cap when value_weighted=True.") 47 | 48 | return PortfolioSort.double_sort( 49 | factor1=self.factor1, 50 | factor2=self.factor2, 51 | forward_returns=self.forward_returns, 52 | market_cap=market_cap, 53 | quantiles=quantiles, 54 | dependent=dependent, 55 | value_weighted=value_weighted, 56 | get_quantile_sorts=get_quantile_sorts 57 | ) 58 | 59 | def run_msr_test(self, regularize=True): 60 | """ 61 | Compare the Maximum Sharpe Ratios of two factor models using a Z-test. 62 | Args: 63 | regularize (bool): If True, regularize the covariance matrices. 64 | Returns: 65 | dict: { 66 | 'msr_a': float, # MSR of model A 67 | 'msr_b': float, # MSR of model B 68 | 'test_stat': float, # Z-statistic 69 | 'p_value': float # two-sided p-value 70 | } 71 | """ 72 | return MSRTest.run_msr_comparison(model_a=self.factor1, model_b=self.factor2, regularize_covariance=regularize) 73 | 74 | def run_all(self, market_cap: pd.DataFrame = None) -> dict: 75 | """ 76 | Run all evaluation methods and return results as a dictionary. 77 | 78 | Parameters: 79 | market_cap: pd.DataFrame (required for the value-weighted double sort) 80 | 81 | Returns: 82 | dict: 83 | {'double_sort': result of double sort, 84 | 'msr_test': result of MSR test} 85 | """ 86 | results = {} 87 | 88 | try: 89 | results['double_sort'] = self.run_double_sort( 90 | quantiles=(5, 5), 91 | value_weighted=True, 92 | market_cap=market_cap, 93 | get_quantile_sorts=False 94 | ) 95 | except Exception as e: 96 | results['double_sort'] = f"Error: {e}" 97 | 98 | try: 99 | results['msr_test'] = self.run_msr_test( 100 | regularize=True 101 | ) 102 | except Exception as e: 103 | results['msr_test'] = f"Error: {e}" 104 | 105 | return results -------------------------------------------------------------------------------- /firefin/evaluation/academia/MSR_Test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import norm 4 | 5 | class MSRTest: 6 | """ 7 | A class to compute and statistically compare the Maximum Sharpe Ratios (MSRs) 8 | between two factor models using the asymptotic test from Barillas & Shanken (2018). 9 | """ 10 | 11 | @staticmethod 12 | def compute_max_sharpe_ratio(factor_returns: pd.DataFrame, regularize_covariance: bool = False) -> tuple: 13 | """ 14 | Compute the maximum Sharpe ratio for a factor model. 15 | 16 | Args: 17 | factor_returns (pd.DataFrame): T × K matrix of factor returns. 18 | regularize_covariance (bool): If True, regularize the covariance matrix. 
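        Note:
            With mean return vector mu and covariance Sigma, the tangency
            portfolio's maximum Sharpe ratio is sqrt(mu' Sigma^{-1} mu), which is
            exactly the quantity returned here.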
19 | Returns: 20 | tuple: 21 | - float: Maximum Sharpe Ratio 22 | - np.ndarray: Mean return vector (μ) 23 | - np.ndarray: Covariance matrix (Σ) 24 | """ 25 | mu = factor_returns.mean().values 26 | sigma = np.cov(factor_returns.T, ddof=1) 27 | 28 | # Regularizing the covariance matrix 29 | if regularize_covariance: 30 | epsilon = 1e-6 # small constant for regularization 31 | sigma += np.eye(sigma.shape[0]) * epsilon 32 | 33 | msr = np.sqrt(mu @ np.linalg.inv(sigma) @ mu) 34 | return msr, mu, sigma 35 | 36 | @staticmethod 37 | def asymptotic_variance_msr_squared(mu: np.ndarray, sigma: np.ndarray, T: int) -> float: 38 | """ 39 | Compute the asymptotic variance of the squared maximum Sharpe ratio. 40 | 41 | Args: 42 | mu (np.ndarray): Mean return vector. 43 | sigma (np.ndarray): Covariance matrix. 44 | T (int): Sample size. 45 | 46 | Returns: 47 | float: Asymptotic variance of MSR². 48 | """ 49 | inv_sigma = np.linalg.inv(sigma) 50 | term = 4 * (mu @ inv_sigma @ sigma @ inv_sigma @ mu) 51 | return term / T 52 | 53 | @staticmethod 54 | def run_msr_comparison(model_a: pd.DataFrame, model_b: pd.DataFrame, regularize_covariance: bool = False) -> dict: 55 | """ 56 | Compare the Maximum Sharpe Ratios of two factor models using a Z-test. 57 | 58 | Args: 59 | model_a (pd.DataFrame): T × K matrix of factor returns for model A. 60 | model_b (pd.DataFrame): T × K matrix of factor returns for model B. 61 | regularize_covariance (bool): If True, regularize the covariance matrix. 62 | Returns: 63 | dict: { 64 | 'msr_a': float, # MSR of model A 65 | 'msr_b': float, # MSR of model B 66 | 'test_stat': float, # Z-statistic 67 | 'p_value': float # two-sided p-value 68 | } 69 | """ 70 | T = model_a.shape[0] 71 | # Compute MSRs and their components 72 | msr_a, mu_a, sigma_a = MSRTest.compute_max_sharpe_ratio(model_a, regularize_covariance) 73 | msr_b, mu_b, sigma_b = MSRTest.compute_max_sharpe_ratio(model_b, regularize_covariance) 74 | 75 | # Compute variances of MSR² 76 | msr2_a = msr_a ** 2 77 | msr2_b = msr_b ** 2 78 | var_a = MSRTest.asymptotic_variance_msr_squared(mu_a, sigma_a, T) 79 | var_b = MSRTest.asymptotic_variance_msr_squared(mu_b, sigma_b, T) 80 | 81 | # Z-test for MSR² difference 82 | diff = msr2_a - msr2_b 83 | std_error = np.sqrt(var_a + var_b) 84 | z_stat = diff / std_error 85 | p_value = 2 * (1 - norm.cdf(np.abs(z_stat))) 86 | 87 | return { 88 | "msr_a": msr_a, 89 | "msr_b": msr_b, 90 | "test_stat": z_stat, 91 | "p_value": p_value 92 | } -------------------------------------------------------------------------------- /firefin/evaluation/academia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire-institute/fire/ca564999d00b983d5d1181fb57f914906ebfcf5e/firefin/evaluation/academia/__init__.py -------------------------------------------------------------------------------- /firefin/evaluation/academia/anomaly_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | from typing import List, Optional, Union 5 | 6 | from ...core.algorithm.regression import _regression, RegressionResult 7 | from ..eva_utils import QuantileReturns 8 | 9 | 10 | class AnomalyTest: 11 | """ 12 | Perform anomaly tests by regressing portfolio returns on a specified factor model 13 | and summarizing the resulting parameter estimates and test statistics. 
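    A minimal usage sketch (names are illustrative; `port_rets` holds portfolio
    return columns and `factor_rets` the factor return series):

    >>> tester = AnomalyTest(port_rets, factor_rets)
    >>> summary = tester.fit(cov_type="HAC", cov_kwds={"maxlags": 6}).test_statistics()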
14 | 15 | Attributes 16 | ---------- 17 | portfolio_returns : pd.DataFrame 18 | DataFrame of portfolio returns, with each column representing a distinct portfolio. 19 | factor_model : pd.DataFrame 20 | DataFrame containing factor return series as independent variables. 21 | _regression_results : dict[str, RegressionResult] 22 | Mapping from portfolio name to its fitted RegressionResult. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | portfolio_returns: QuantileReturns, 28 | factor_model: Union[pd.DataFrame, List[pd.Series]], 29 | ) -> None: 30 | """ 31 | Initialize the AnomalyTest object. 32 | 33 | Parameters 34 | ---------- 35 | portfolio_returns : QuantileReturns 36 | Data structure holding portfolio returns. Must be convertible to a DataFrame 37 | and have a .columns attribute. 38 | factor_model : DataFrame or list of Series 39 | Factor return series used as regressors. Can be a DataFrame or a list of Series. 40 | 41 | Raises 42 | ------ 43 | TypeError 44 | If inputs are not of the expected types. 45 | ValueError 46 | If factor_model is empty. 47 | """ 48 | # Convert portfolio_returns to DataFrame if needed 49 | if not hasattr(portfolio_returns, "columns"): 50 | raise TypeError("`portfolio_returns` must have a `columns` attribute.") 51 | self.portfolio_returns = ( 52 | portfolio_returns 53 | if isinstance(portfolio_returns, pd.DataFrame) 54 | else pd.DataFrame(portfolio_returns) 55 | ) 56 | 57 | # Build factor DataFrame 58 | if isinstance(factor_model, list): 59 | if not factor_model: 60 | raise ValueError("`factor_model` cannot be an empty list.") 61 | self.factor_model = pd.concat(factor_model, axis=1) 62 | elif isinstance(factor_model, pd.DataFrame): 63 | if factor_model.empty: 64 | raise ValueError("`factor_model` DataFrame cannot be empty.") 65 | self.factor_model = factor_model 66 | else: 67 | raise TypeError("`factor_model` must be a DataFrame or a list of Series.") 68 | 69 | self.factor_model.columns = [f'Factor_{i}' for i in self.factor_model.columns] 70 | self._regression_results: dict[str, RegressionResult] = {} 71 | 72 | def fit( 73 | self, 74 | cov_type: Optional[str] = None, 75 | cov_kwds: Optional[dict] = None, 76 | ) -> AnomalyTest: 77 | """ 78 | Fit time-series regressions of each portfolio return on the factor model. 79 | 80 | Parameters 81 | ---------- 82 | cov_type : str, optional 83 | Covariance estimator type (e.g., 'HAC' for Newey–West or 'HC1'). 84 | If None, uses the default homoskedastic standard errors. 85 | cov_kwds : dict, optional 86 | Keyword arguments for the covariance estimator (e.g., {'maxlags': L}). 87 | 88 | Returns 89 | ------- 90 | self : AnomalyTest 91 | The fitted AnomalyTest instance, with regression results stored internally. 
92 | """ 93 | # Align portfolio and factor data on the same dates 94 | df_all = pd.concat([self.portfolio_returns, self.factor_model], axis=1).dropna() 95 | y_df = df_all[self.portfolio_returns.columns] 96 | x_df = df_all[self.factor_model.columns] 97 | 98 | for port in y_df.columns: 99 | raw_res = _regression( 100 | x=x_df, 101 | y=y_df[port], 102 | w=None, 103 | fit_intercept=True, 104 | cov_type=cov_type, 105 | cov_kwds=cov_kwds, 106 | ) 107 | # Determine if regression is univariate 108 | univariate = (x_df.shape[1] == 1) 109 | wrapped = RegressionResult(raw_res, fit_intercept=True, univariate=univariate) 110 | self._regression_results[port] = wrapped 111 | return self 112 | 113 | def test_statistics(self) -> pd.DataFrame: 114 | """ 115 | Generate a comprehensive summary table of parameter estimates and test statistics. 116 | 117 | Returns 118 | ------- 119 | summary : pd.DataFrame 120 | A MultiIndex DataFrame with rows labeled by (portfolio, parameter) and 121 | columns ['coef', 'tvalue', 'stderr', 'pvalue']. 122 | """ 123 | records = [] 124 | for port, res in self._regression_results.items(): 125 | r = res.sm_result 126 | for param in r.params.index: 127 | records.append((port, param, r.params[param], r.tvalues[param], r.bse[param], r.pvalues[param])) 128 | summary = pd.DataFrame(records, columns=['portfolio', 'parameter', 'coef', 'tvalue', 'stderr', 'pvalue']) 129 | return summary.set_index(['portfolio', 'parameter']) 130 | 131 | def alpha(self, portfolio: str) -> float: 132 | """ 133 | Retrieve the intercept (alpha) from the regression of a specific portfolio. 134 | 135 | Parameters 136 | ---------- 137 | portfolio : str 138 | The name of the portfolio column. 139 | 140 | Returns 141 | ------- 142 | alpha : float 143 | The estimated intercept term. 144 | """ 145 | return self._regression_results[portfolio].alpha # type: ignore 146 | 147 | def betas(self, portfolio: str) -> pd.Series: 148 | """ 149 | Retrieve the factor loadings (betas) for a specific portfolio. 150 | 151 | Parameters 152 | ---------- 153 | portfolio : str 154 | The name of the portfolio column. 155 | 156 | Returns 157 | ------- 158 | betas : pd.Series 159 | Series of estimated factor coefficients, indexed by factor name. 160 | """ 161 | return self._regression_results[portfolio].beta # type: ignore 162 | -------------------------------------------------------------------------------- /firefin/evaluation/academia/fama_macbeth.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ...core.algorithm.newey_west_ttest_1samp import NeweyWestTTest 4 | from ...core.algorithm.regression import RollingRegressor, BatchRegressionResult 5 | 6 | 7 | class FamaMacBeth: 8 | 9 | @staticmethod 10 | def run_regression( 11 | factor: pd.DataFrame | pd.Series, return_adj: pd.DataFrame, window: int = 252, n_jobs=4, verbose: int = 0 12 | ) -> BatchRegressionResult: 13 | """ 14 | Run Fama-MacBeth regression. 15 | """ 16 | if isinstance(factor, pd.Series): 17 | # Convert series to DataFrame for consistency 18 | factor = pd.concat([factor] * return_adj.shape[1], axis=1) 19 | factor.columns = return_adj.columns 20 | elif isinstance(factor, pd.DataFrame): 21 | pass 22 | else: 23 | raise ValueError("Factor must be a pandas Series or DataFrame.") 24 | 25 | # Note: Calculate excess returns if necessary 26 | # return_adj = return_adj - risk_free_rate 27 | # excess returns differ across use cases, so we leave this to the user. 
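        # Illustrative only: with a risk-free series `rf` aligned to `return_adj`'s
        # index, excess returns would be computed as `return_adj.sub(rf, axis=0)`.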
28 | 29 | # First step: Time-series regressions 30 | r = RollingRegressor(factor, return_adj, None, fit_intercept=True).fit(window, n_jobs=n_jobs, verbose=verbose) 31 | 32 | # Second step: Cross-sectional regressions 33 | # At each date, the cross-section of returns is regressed on the first-stage betas 34 | r = RollingRegressor(r.beta, return_adj, None, fit_intercept=True).fit(window=None, axis=1, n_jobs=n_jobs, verbose=verbose) 35 | 36 | return r 37 | 38 | @staticmethod 39 | def test_statistics(results: BatchRegressionResult) -> pd.Series: 40 | # mean and std 41 | 42 | mean_beta = results.beta.mean() 43 | std_beta = results.beta.std() 44 | 45 | mean_alpha = results.alpha.mean() 46 | std_alpha = results.alpha.std() 47 | 48 | # t-statistics 49 | 50 | t_stat, p_value, se = NeweyWestTTest.newey_west_ttest_1samp(results.beta, popmean=0, lags=6, nan_policy="omit") 51 | 52 | return pd.Series( 53 | { 54 | "mean_beta": mean_beta, 55 | "std_beta": std_beta, 56 | "mean_alpha": mean_alpha, 57 | "std_alpha": std_alpha, 58 | "t_stat": t_stat, 59 | "p_value": p_value, 60 | "se": se, 61 | } 62 | ) 63 | -------------------------------------------------------------------------------- /firefin/evaluation/academia/portfolio_sort.py: -------------------------------------------------------------------------------- 1 | """ 2 | Portfolio Sort Implementation for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing single and double portfolio sorts 5 | based on characteristics, market capitalization, and returns. The implementation 6 | focuses on clarity, documentation, and best practices for financial research. 7 | """ 8 | 9 | import typing 10 | import numpy as np 11 | import pandas as pd 12 | from ...core.algorithm.newey_west_ttest_1samp import NeweyWestTTest 13 | from ..eva_utils import factor_to_quantile, factor_to_quantile_dependent_double_sort 14 | from ..eva_utils import _compute_quantile_df, _compute_weighted_quantile_df 15 | from ..eva_utils import ForwardReturns, QuantileReturns 16 | 17 | StatisticResults = typing.NewType("StatisticResults", dict[str, pd.DataFrame]) 18 | 19 | class PortfolioSort: 20 | """ 21 | Class to perform single and double portfolio sorts based on characteristics. 22 | """ 23 | 24 | @staticmethod 25 | def single_sort( 26 | factor: pd.DataFrame, 27 | forward_returns: ForwardReturns, 28 | quantiles: int, 29 | value_weighted: bool = True, 30 | get_quantile_sorts: bool = False, 31 | market_cap: pd.DataFrame | None = None, 32 | ) -> typing.Union[QuantileReturns, pd.DataFrame]: 33 | """ 34 | Perform single portfolio sort based on characteristic and create long-short portfolio. 35 | 36 | Args: 37 | factor: (Time x Stock) DataFrame of characteristic exposures 38 | forward_returns: period : (Time x Stock) DataFrame of returns 39 | market_cap: (Time x Stock) DataFrame of market capitalizations 40 | quantiles: number of quantiles 41 | value_weighted: Use market cap weighting (True) or equal weighting (False) 42 | get_quantile_sorts: Return portfolio assignments 43 | Returns: 44 | Portfolio returns and statistical results 45 | """ 46 | # 1. DATA PREPARATION 47 | # assume factor, forward_return, market_cap are aligned DataFrames in our case 48 | # 2. QUANTILE CALCULATIONS 49 | quantile_sorts = factor_to_quantile(factor, quantiles) 50 | 51 | # Early exit if quantile assignments requested 52 | if get_quantile_sorts: 53 | return quantile_sorts 54 | 55 | # 3. 
RETURN CALCULATIONS 56 | # TODO: Add support for other weighting schemes 57 | # TODO: Add transaction costs 58 | if value_weighted: 59 | portfolio_returns = QuantileReturns({ 60 | period: _compute_weighted_quantile_df(quantile_sorts, period_returns, market_cap, quantiles=quantiles) 61 | for period, period_returns in forward_returns.items() 62 | }) 63 | else: 64 | # equal weighted 65 | portfolio_returns = QuantileReturns({ 66 | period: _compute_quantile_df(quantile_sorts, period_returns, quantiles=quantiles) 67 | for period, period_returns in forward_returns.items() 68 | }) 69 | 70 | # 4. HEDGE PORTFOLIO (High-Low) 71 | for period, _ in forward_returns.items(): 72 | portfolio_returns[period]["H-L"] = ( 73 | portfolio_returns[period][quantiles] - portfolio_returns[period][1] 74 | ) 75 | 76 | return portfolio_returns 77 | 78 | @staticmethod 79 | def double_sort( 80 | factor1: pd.DataFrame, 81 | factor2: pd.DataFrame, 82 | forward_returns: ForwardReturns, 83 | quantiles: typing.Tuple[int, int] = (5, 5), 84 | dependent: bool = False, 85 | value_weighted: bool = True, 86 | get_quantile_sorts: bool = False, 87 | market_cap: pd.DataFrame | None = None, 88 | ) -> typing.Union[QuantileReturns, pd.DataFrame]: 89 | """ 90 | Sort securities based on two characteristics. 91 | 92 | Args: 93 | factor1: (Time x Stock) DataFrame of characteristic exposures 94 | factor2: (Time x Stock) DataFrame of characteristic exposures 95 | forward_returns: period : (Time x Stock) DataFrame of returns 96 | market_cap: (Time x Stock) DataFrame of market capitalizations 97 | dependent: Whether to use dependent sorting (True) or independent sorting (False) 98 | quantiles: number of quantiles 99 | value_weighted: Use market cap weighting (True) or equal weighting (False) 100 | get_quantile_sorts: Return portfolio assignments 101 | Returns: 102 | Portfolio returns and statistical results 103 | """ 104 | # Ensure that factor1 and factor2 have the same index and columns 105 | assert factor1.index.equals(factor2.index) and factor1.columns.equals(factor2.columns), \ 106 | "factor1 and factor2 must have the same index and columns" 107 | 108 | # 1. DATA PREPARATION 109 | # assume factor1, factor2, forward_return, market_cap are aligned DataFrames in our case 110 | # 2. QUANTILE CALCULATIONS 111 | if dependent: 112 | # Dependent sorting (conditional double sorting) 113 | """ 114 | Note from Professor SHI: 115 | 116 | Suppose we first sort the stocks based on X1, dividing all stocks into L1 groups. Then, within each of 117 | these L1 groups, we further sort the stocks based on X2, dividing the stocks into L2 groups. Again, a 118 | total of L1 × L2 groups. 119 | 120 | The two sorting variables are NOT treated equally: the first sorting variable acts solely as a control 121 | variable, and the main interest is the relationship between the second sorting variable and asset 122 | returns. A factor should only be constructed based on the second sorting variable. 123 | 124 | Let's assume factor1 is the control variable and factor2 is the main variable of interest. 125 | We will first sort the stocks based on factor1, then within each group, we will sort the stocks based on factor2. 
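                Concretely, with quantiles=(5, 5) each date's stocks are first cut into
                5 groups on factor1 and then into 5 groups on factor2 within each of
                those, giving 25 portfolios labelled "q1_q2".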
126 | """ 127 | 128 | combined_sorts = factor_to_quantile_dependent_double_sort(factor1, factor2, quantiles) 129 | else: 130 | # independent sorting (unconditional double sorting) 131 | # Independent sorting can leave some stocks without a quantile label (None) 132 | 133 | quantile_sorts_factor1 = factor_to_quantile(factor1, quantiles[0]).astype(int) 134 | quantile_sorts_factor2 = factor_to_quantile(factor2, quantiles[1]).astype(int) 135 | # quantile_sorts to string and add them to q1_q2 format 136 | combined_sorts = quantile_sorts_factor1.astype(str) + "_" + quantile_sorts_factor2.astype(str) 137 | 138 | # Early exit if quantile assignments requested (no need to compute returns) 139 | if get_quantile_sorts: 140 | return combined_sorts 141 | 142 | # Initialize a dictionary to store the portfolio returns 143 | portfolio_returns = {} 144 | 145 | # 3. RETURN CALCULATIONS 146 | for period, period_returns in forward_returns.items(): 147 | 148 | # Calculate returns for each combined quantile 149 | if value_weighted: 150 | period_portfolio_returns = _compute_weighted_quantile_df( 151 | combined_sorts, period_returns, market_cap, reindex=False, quantiles=quantiles[0] * quantiles[1] 152 | ) 153 | else: 154 | # equal weighted 155 | period_portfolio_returns = _compute_quantile_df( 156 | combined_sorts, period_returns, reindex=False, quantiles=quantiles[0] * quantiles[1] 157 | ) 158 | # Store the results 159 | portfolio_returns[period] = period_portfolio_returns 160 | 161 | # 4. HEDGE PORTFOLIO (High-High vs Low-Low) 162 | for period, _ in forward_returns.items(): 163 | high_high = portfolio_returns[period].xs(f"{quantiles[0]}_{quantiles[1]}", axis=1) 164 | low_low = portfolio_returns[period].xs("1_1", axis=1) 165 | portfolio_returns[period]['HH-LL'] = high_high - low_low 166 | 167 | return QuantileReturns(portfolio_returns) 168 | 169 | @staticmethod 170 | def get_statistics(result: QuantileReturns, quantiles: int) -> StatisticResults: 171 | """ 172 | Compute statistical results for single portfolio sort. 173 | 174 | TODO: 175 | 1. Add more statistics 176 | 2. 
plot the results 177 | """ 178 | # T-Test for all periods 179 | # periods * (quantiles + H-L) 180 | t_stats = np.empty((len(result), quantiles + 1), dtype=float) 181 | p_values = np.empty((len(result), quantiles + 1), dtype=float) 182 | se_values = np.empty((len(result), quantiles + 1), dtype=float) 183 | mean_returns = np.empty((len(result), quantiles + 1), dtype=float) 184 | 185 | for n, (_, period_returns) in enumerate(result.items()): 186 | # T-Test for all periods 187 | t_stats[n], p_values[n], se_values[n] = np.apply_along_axis( 188 | NeweyWestTTest.newey_west_ttest_1samp, 189 | axis=0, 190 | arr=period_returns, 191 | popmean=0, 192 | lags=6, 193 | nan_policy='omit' 194 | ) 195 | # other statistics can be added here 196 | mean_returns[n] = np.nanmean(period_returns, axis=0) 197 | 198 | return StatisticResults({'t_stats': pd.DataFrame(t_stats, index=result.keys(), columns=period_returns.columns), 199 | 'p_values': pd.DataFrame(p_values, index=result.keys(), columns=period_returns.columns), 200 | 'se_values': pd.DataFrame(se_values, index=result.keys(), columns=period_returns.columns), 201 | 'mean_returns': pd.DataFrame(mean_returns, index=result.keys(), columns=period_returns.columns)}) -------------------------------------------------------------------------------- /firefin/evaluation/academia/winsorizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Winsorization Implementation for Academic Research 3 | --------------------------------------------------- 4 | This module provides a class for performing winsorizations, including MAD, k-sigma, 5 | and winsorization at extreme percentiles. The implementation focuses on clarity, 6 | documentation, and best practices for financial research. 7 | """ 8 | 9 | import pandas as pd 10 | import numpy as np 11 | from typing import Union, Tuple 12 | 13 | class Winsorizer: 14 | """ 15 | A class to perform winsorizations on cross-sectional characteristic matrices. 16 | Supports input as either pandas DataFrame or numpy array. 17 | """ 18 | 19 | @staticmethod 20 | def __to_dataframe(features: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame: 21 | """ 22 | Convert the input features to a pandas DataFrame if it is a numpy array. 23 | 24 | Args: 25 | features: (Time x Stock) DataFrame of features. 26 | 27 | Returns: 28 | pd.DataFrame: The input converted to a DataFrame. 29 | """ 30 | if isinstance(features, pd.DataFrame): 31 | return features.copy() 32 | elif isinstance(features, np.ndarray): 33 | return pd.DataFrame(features) 34 | else: 35 | raise TypeError("Input features must be a pandas DataFrame or a numpy array.") 36 | 37 | @staticmethod 38 | def __to_original_type(result: pd.DataFrame, original: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]: 39 | """ 40 | Convert the DataFrame result back to the type of the original input. 41 | 42 | Args: 43 | result (pd.DataFrame): The processed DataFrame. 44 | original (Union[pd.DataFrame, np.ndarray]): The original input features. 45 | 46 | Returns: 47 | Union[pd.DataFrame, np.ndarray]: The result in the same type as the original input. 48 | """ 49 | if isinstance(original, np.ndarray): 50 | return result.values 51 | return result 52 | 53 | @classmethod 54 | def MAD_winsorization( 55 | cls, 56 | features: Union[pd.DataFrame, np.ndarray], 57 | scaled: bool = False, 58 | k: int = 3 59 | ) -> Union[pd.DataFrame, np.ndarray]: 60 | """ 61 | Apply winsorization on features using the Median Absolute Deviation (MAD) method. 
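        Limits are median ± k * MAD, computed row-wise (per date). With scaled=True
        the MAD is multiplied by 1.4826, which makes it a consistent estimator of the
        standard deviation for normally distributed data.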
62 | 63 | Args: 64 | features: (Time x Stock) DataFrame of features. 65 | scaled (bool, optional): Whether to scale the MAD value (MAD * 1.4826). Default is False. 66 | k (int, optional): Scaling factor to determine limits. Default is 3. 67 | 68 | Returns: 69 | Union[pd.DataFrame, np.ndarray]: Winsorized features using MAD. 70 | """ 71 | original = features 72 | df = cls.__to_dataframe(features) 73 | 74 | # Calculate the median for each row 75 | median = df.median(axis=1) 76 | # Compute the absolute deviation from the median, then calculate the median of these deviations for each row 77 | mad = (df.sub(median, axis=0)).abs().median(axis=1) 78 | 79 | # Scale the MAD if required 80 | if scaled: 81 | mad *= 1.4826 82 | 83 | # Calculate the lower and upper limits for winsorization 84 | lower = median - k * mad 85 | upper = median + k * mad 86 | 87 | # Apply winsorization using the DataFrame.clip() method for each row 88 | result = df.clip(lower=lower, upper=upper, axis=0) 89 | return cls.__to_original_type(result, original) 90 | 91 | @classmethod 92 | def sigma_winsorization( 93 | cls, 94 | features: Union[pd.DataFrame, np.ndarray], 95 | k: int = 3 96 | ) -> Union[pd.DataFrame, np.ndarray]: 97 | """ 98 | Apply winsorization on features using the k-sigma rule. 99 | 100 | Args: 101 | features: (Time x Stock) DataFrame of features. 102 | k (int, optional): Scaling factor to determine limits. Default is 3. 103 | 104 | Returns: 105 | Union[pd.DataFrame, np.ndarray]: Winsorized features using the k-sigma rule. 106 | """ 107 | original = features 108 | df = cls.__to_dataframe(features) 109 | 110 | # Calculate the mean and standard deviation for each row 111 | mean = df.mean(axis=1) 112 | std = df.std(axis=1) 113 | 114 | # Calculate the lower and upper limits for winsorization 115 | lower = mean - k * std 116 | upper = mean + k * std 117 | 118 | # Apply winsorization using the DataFrame.clip() method for each row 119 | result = df.clip(lower=lower, upper=upper, axis=0) 120 | return cls.__to_original_type(result, original) 121 | 122 | @classmethod 123 | def percentile_winsorization( 124 | cls, 125 | features: Union[pd.DataFrame, np.ndarray], 126 | percentile: Tuple[float, float] = (0.01, 0.99), 127 | set_outlier_nan: bool = False 128 | ) -> Union[pd.DataFrame, np.ndarray]: 129 | """ 130 | Apply winsorization on features using the percentile rule. 131 | 132 | Args: 133 | features: (Time x Stock) DataFrame of features. 134 | percentile (Tuple[float, float], optional): The lower and upper percentiles to winsorize. 135 | Default is (0.01, 0.99). 136 | set_outlier_nan (bool, optional): Whether to set outliers to be NaN instead of clipping them. Default is False. 137 | 138 | Returns: 139 | Union[pd.DataFrame, np.ndarray]: Winsorized features using the percentile rule. 
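        Example (illustrative):

        >>> import numpy as np, pandas as pd
        >>> df = pd.DataFrame(np.random.randn(5, 100))
        >>> clipped = Winsorizer.percentile_winsorization(df, percentile=(0.05, 0.95))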
140 | """ 141 | original = features 142 | df = cls.__to_dataframe(features) 143 | 144 | # Calculate lower and upper bounds based on the given percentiles 145 | lower_bound = df.quantile(percentile[0], axis=1) 146 | upper_bound = df.quantile(percentile[1], axis=1) 147 | 148 | if set_outlier_nan: 149 | # set the outlier values to be NaN 150 | mask = df.lt(lower_bound, axis=0) | df.gt(upper_bound, axis=0) 151 | result = df.mask(mask) 152 | else: 153 | # Clip values to the specified bounds 154 | result = df.clip(lower=lower_bound, upper=upper_bound, axis=0) 155 | 156 | return cls.__to_original_type(result, original) -------------------------------------------------------------------------------- /firefin/evaluation/eva_utils.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt 3 | 4 | # TODO: Move some common algorithms to fire/core/algorithm/ 5 | 6 | import typing 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | __all__ = [ 12 | "compute_forward_returns", 13 | "compute_ic", 14 | "factor_to_quantile", 15 | "factor_to_quantile_dependent_double_sort", 16 | "compute_quantile_returns", 17 | "_compute_weighted_quantile_df", 18 | "_compute_quantile_df", 19 | ] 20 | 21 | PeriodType = typing.NewType("PeriodType", int) 22 | ForwardReturns = typing.NewType("ForwardReturns", dict[PeriodType, pd.DataFrame]) 23 | IC = typing.NewType("IC", pd.DataFrame) 24 | QuantileReturns = typing.NewType("QuantileReturns", dict[PeriodType, pd.DataFrame]) 25 | 26 | 27 | def compute_forward_returns(price: pd.DataFrame, periods: list[PeriodType]) -> ForwardReturns: 28 | forward_returns_dict = {} 29 | 30 | returns: pd.DataFrame = np.log(price).shift(-1) - np.log(price) 31 | 32 | for period in sorted(periods): 33 | if period == 1: 34 | forward_returns_dict[period] = np.exp(returns) - 1  # convert the 1-day log return to a simple return, consistent with period > 1 35 | continue 36 | 37 | log_period_returns = returns.rolling(period).sum().shift(1 - period) 38 | period_returns: pd.DataFrame = np.exp(log_period_returns) - 1 39 | forward_returns_dict[period] = period_returns 40 | return ForwardReturns(forward_returns_dict) 41 | 42 | 43 | def _compute_ic_df_df( 44 | a: pd.DataFrame, b: pd.DataFrame, method: typing.Literal["pearson", "kendall", "spearman"] 45 | ) -> pd.Series: 46 | return a.corrwith(b, axis=1, method=method) 47 | 48 | 49 | def compute_ic( 50 | factor: pd.DataFrame, forward_returns: ForwardReturns, method: typing.Literal["pearson", "kendall", "spearman"] 51 | ) -> IC: 52 | """ 53 | Compute IC (Information Coefficient) for the factor and forward returns, which is the correlation between the 54 | factor and the forward returns. 55 | 56 | Parameters 57 | ---------- 58 | factor: pd.DataFrame 59 | forward_returns: ForwardReturns 60 | method: str 61 | default "pearson" 62 | 63 | Returns 64 | ------- 65 | IC 66 | a dataframe of IC values for each period in columns. 67 | 68 | """ 69 | factor = factor[np.isfinite(factor)] 70 | return IC( 71 | pd.DataFrame( 72 | { 73 | period: _compute_ic_df_df(factor, period_returns, method=method) 74 | for period, period_returns in forward_returns.items() 75 | } 76 | ) 77 | ) 78 | 79 | 80 | def factor_to_quantile(factor: pd.DataFrame, quantiles: int = 5) -> pd.DataFrame: 81 | """ 82 | Convert factor to quantile row-wise. The result will always have quantile values ranging from `quantiles` down 83 | to 1 continuously (if only 1 group, it'll be `quantiles`). 
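    For example, with the default quantiles=5, a date (row) of ten strictly
    increasing factor values is labelled [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]:

    >>> import numpy as np, pandas as pd
    >>> factor_to_quantile(pd.DataFrame([np.arange(10.0)])).iloc[0].tolist()
    [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0]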
80 | def factor_to_quantile(factor: pd.DataFrame, quantiles: int = 5) -> pd.DataFrame:
81 |     """
82 |     Convert factor to quantile row-wise. The result will always have quantile values running from `quantiles` down
83 |     to 1 continuously (if there is only 1 group, it will be `quantiles`).
84 | 
85 |     Parameters
86 |     ----------
87 |     factor: pd.DataFrame
88 |     quantiles: int
89 |         default 5
90 | 
91 |     Returns
92 |     -------
93 |     pd.DataFrame
94 |         a dataframe of quantile values.
95 | 
96 |     """
97 |     quantile_values = np.arange(1, quantiles + 1)
98 | 
99 |     def _row_to_quantile(row):
100 |         finite = np.isfinite(row)
101 |         if finite.any():
102 |             tmp: pd.Series = pd.qcut(row[finite], quantiles, labels=False, duplicates="drop")
103 |             # rearrange values from `q` to 1
104 |             # this makes sure that the quantile values are generally continuous,
105 |             # and we always have a group of long portfolio of `q`
106 |             old_values = tmp.unique()
107 |             old_values.sort()
108 |             new_values = quantile_values[-len(old_values) :]
109 |             if not np.array_equal(old_values, new_values):
110 |                 tmp.replace(old_values, new_values, inplace=True)
111 |             row = row.copy()
112 |             row[finite] = tmp
113 |             return row
114 |         else:
115 |             return row
116 | 
117 |     return factor.apply(_row_to_quantile, axis=1)  # axis=1: quantiles are formed cross-sectionally within each date
118 | 
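A concrete check of the relabeling rule (ties make qcut drop bins; the surviving groups are renumbered so that the top group keeps the label `quantiles`):

import numpy as np
import pandas as pd
from firefin.evaluation.eva_utils import factor_to_quantile

f = pd.DataFrame([[np.nan, 1.0, 1.0, 1.0, 2.0, 3.0]])
print(factor_to_quantile(f, quantiles=5))
# Expected: [NaN, 3, 3, 3, 4, 5] -- only three bins survive the ties, and they
# are relabeled 3, 4, 5, so a long portfolio can always be built from group 5.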
119 | def factor_to_quantile_dependent_double_sort(primary_factor: pd.DataFrame, secondary_factor: pd.DataFrame, quantiles: typing.Tuple[int, int]):
120 |     """
121 |     Perform dependent double sorting on two factors.
122 | 
123 |     Parameters:
124 |     ------------
125 |     primary_factor : pd.DataFrame
126 |         The primary factor used for initial sorting.
127 |     secondary_factor : pd.DataFrame
128 |         The secondary factor used for sorting within each group defined by the primary factor.
129 |     quantiles : tuple of int
130 |         A tuple containing the number of quantiles for the primary and secondary factors respectively.
131 | 
132 |     Returns:
133 |     --------
134 |     quantile_sorts : pd.DataFrame
135 |         A DataFrame where each entry represents the quantile assignment for the secondary factor within the group defined by the primary factor.
136 | 
137 |     TODO: numba jit acceleration
138 |     """
139 |     quantile_values_p = np.arange(1, quantiles[0] + 1)
140 |     quantile_values_s = np.arange(1, quantiles[1] + 1)
141 | 
142 |     def _row_to_quantile(row_p, row_s):
143 |         finite_p, finite_s = np.isfinite(row_p), np.isfinite(row_s)
144 |         if not (finite_p.any() and finite_s.any()):
145 |             return pd.Series(index=row_p.index, dtype=object)
146 | 
147 |         # Sort by primary factor first
148 |         temp_p: pd.Series = pd.qcut(row_p[finite_p], quantiles[0], labels=False, duplicates="drop")
149 |         old_values = temp_p.unique()
150 |         old_values.sort()
151 |         new_values = quantile_values_p[-len(old_values) :]
152 |         if not np.array_equal(old_values, new_values):
153 |             temp_p.replace(old_values, new_values, inplace=True)
154 |         # align to the full cross-section so the boolean masks below match
155 |         temp_p = temp_p.reindex(row_p.index)
156 | 
157 |         # Sort by secondary factor within each primary quantile
158 |         # (float dtype so that unrankable stocks can be stored as NaN)
159 |         temp_s = pd.Series(np.nan, index=row_p.index)
160 |         for q in quantile_values_p:
161 |             mask = (temp_p == q) & finite_s
162 |             if mask.any():
163 |                 temp_s[mask] = pd.qcut(row_s[mask], quantiles[1], labels=False, duplicates="drop")
164 | 
165 |         old_values = temp_s.dropna().unique()
166 |         old_values.sort()
167 |         new_values = quantile_values_s[-len(old_values) :]
168 |         if not np.array_equal(old_values, new_values):
169 |             temp_s.replace(old_values, new_values, inplace=True)
170 | 
171 |         # build "primary_secondary" string labels; names missing either sort stay NaN
172 |         valid = temp_p.notna() & temp_s.notna()
173 |         labels = pd.Series(index=row_p.index, dtype=object)
174 |         labels[valid] = temp_p[valid].astype(int).astype(str) + "_" + temp_s[valid].astype(int).astype(str)
175 |         return labels
176 | 
177 |     result = pd.DataFrame(index=primary_factor.index, columns=primary_factor.columns)
178 |     # apply the function to each row of both factors
179 |     for (i, row_p), (_, row_s) in zip(primary_factor.iterrows(), secondary_factor.iterrows()):
180 |         result.loc[i] = _row_to_quantile(row_p, row_s)
181 | 
182 |     return result
183 | 
184 | def _compute_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, reindex=True, quantiles: int = 5):
185 |     # assume aligned
186 |     result = {}
187 |     for (dt, fr_row), (_, qt_row) in zip(fr.iterrows(), qt.iterrows()):
188 |         result[dt] = fr_row.groupby(qt_row).mean()
189 |     result = pd.DataFrame(result).T
190 |     if reindex:
191 |         return result.reindex(columns=np.arange(1, quantiles + 1), copy=False)
192 |     return result
193 | 
194 | def _compute_weighted_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, wt: pd.DataFrame, reindex=True, quantiles: int = 5):
195 |     # assume aligned
196 |     result = {}
197 |     for (dt, fr_row), (_, qt_row), (_, wt_row) in zip(fr.iterrows(), qt.iterrows(), wt.iterrows()):
198 |         _wt_row = wt_row.groupby(qt_row).transform(lambda x: x / x.sum())
199 |         result[dt] = (fr_row * _wt_row).groupby(qt_row).sum()
200 |     result = pd.DataFrame(result).T
201 |     if reindex:
202 |         return result.reindex(columns=np.arange(1, quantiles + 1), copy=False)
203 |     return result
204 | 
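The two helpers above differ only in the weighting scheme; a quick side-by-side sketch (the gen_df mocks are the same ones the tests use, with "volume" standing in for market cap as in portfolio_test.py):

from firefin.data.fake import gen_df
from firefin.evaluation.eva_utils import (
    factor_to_quantile, _compute_quantile_df, _compute_weighted_quantile_df,
)

factor = gen_df(10, 100, index="day", mock="rand")
fwd_1d = gen_df(10, 100, index="day", mock="return")
mkt_cap = gen_df(10, 100, index="day", mock="volume")

qt = factor_to_quantile(factor, quantiles=5)
eq = _compute_quantile_df(qt, fwd_1d, quantiles=5)                    # equal-weighted
vw = _compute_weighted_quantile_df(qt, fwd_1d, mkt_cap, quantiles=5)  # cap-weighted
print((vw - eq).mean())  # per-quantile effect of the weighting scheme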
205 | def compute_quantile_returns(
206 |     factor: pd.DataFrame, forward_returns: ForwardReturns, quantiles: int = 5
207 | ) -> QuantileReturns:
208 |     """
209 |     Compute quantile returns. Factor will be converted to quantiles using `factor_to_quantile`. Then, for each period
210 |     in forward_returns, the period returns will be grouped row-wise by quantiles and averaged.
211 | 
212 |     Parameters
213 |     ----------
214 |     factor: pd.DataFrame
215 |     forward_returns: ForwardReturns
216 |     quantiles: int
217 |         default 5
218 | 
219 |     Returns
220 |     -------
221 |     QuantileReturns
222 |         a dictionary of per-period returns for each quantile. Each entry is a DataFrame with dates as index and
223 |         quantiles as columns.
224 | 
225 |     """
226 |     factor_as_quantile = factor_to_quantile(factor, quantiles=quantiles)
227 |     return QuantileReturns(
228 |         {
229 |             period: _compute_quantile_df(factor_as_quantile, period_returns, quantiles=quantiles)
230 |             for period, period_returns in forward_returns.items()
231 |         }
232 |     )
233 | 
-------------------------------------------------------------------------------- /firefin/evaluation/industry/__init__.py: --------------------------------------------------------------------------------
1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
3 | 
4 | """
5 | factor evaluation
6 | 
7 | """
8 | from .evaluator import Evaluator
-------------------------------------------------------------------------------- /firefin/evaluation/industry/evaluator.py: --------------------------------------------------------------------------------
 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
 3 | 
 4 | import typing
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from ...core.plot import plots
 9 | from ..eva_utils import ForwardReturns, IC, QuantileReturns, compute_ic, compute_quantile_returns
10 | 
11 | __all__ = ["Evaluator"]
12 | 
13 | 
14 | def to_datetime_index(df: pd.DataFrame) -> pd.DataFrame:
15 |     out = df.copy(deep=False)
16 |     if not isinstance(out.index, pd.DatetimeIndex):
17 |         out.index = pd.to_datetime(out.index)
18 |     return out
19 | 
20 | 
21 | class Evaluator:
22 |     def __init__(self, factor: pd.DataFrame, forward_returns: ForwardReturns):
23 |         self.factor = factor
24 |         self.forward_returns = forward_returns
25 |         self._to_datetime_index()
26 |         self._reindex_forward_returns()
27 |         self._result = {}
28 | 
29 |     def _to_datetime_index(self):
30 |         self.factor = to_datetime_index(self.factor)
31 |         self.forward_returns = {k: to_datetime_index(v) for k, v in self.forward_returns.items()}
32 | 
33 |     def _reindex_forward_returns(self):
34 |         self.forward_returns = {k: v.reindex_like(self.factor, copy=False) for k, v in self.forward_returns.items()}
35 | 
36 |     def get_ic(self, method: typing.Literal["pearson", "kendall", "spearman"], plot=True) -> IC:
37 |         cache_key = ("ic", (method,))
38 |         if cache_key not in self._result:
39 |             self._result[cache_key] = compute_ic(self.factor, self.forward_returns, method)
40 |         ic = self._result[cache_key]
41 |         if plot:
42 |             plots.plt_ic(ic)
43 |         return ic
44 | 
45 |     def get_quantile_returns(self, quantiles: int = 5, plot=True) -> QuantileReturns:
46 |         cache_key = ("quantile_returns", (quantiles,))
47 |         if cache_key not in self._result:
48 |             self._result[cache_key] = compute_quantile_returns(self.factor, self.forward_returns, quantiles)
49 |         qt = self._result[cache_key]
50 |         if plot:
51 |             plots.plt_quantile_cumulative_returns(qt)
52 |             plots.plt_quantile_cumulated_end_returns(qt)
53 |         return qt
54 | 
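Evaluator memoizes each computation under a (name, parameters) key, so repeated calls (for example, re-plotting) reuse the cached result. A minimal sketch with fake data, in the spirit of tests/test.py:

from firefin.data.fake import gen_df
from firefin.evaluation.industry import Evaluator

factor = gen_df(100, 50, index="day", mock="rand")
forward_returns = {1: gen_df(100, 50, index="day", mock="return")}

ev = Evaluator(factor, forward_returns)
ic = ev.get_ic("pearson", plot=False)
assert ev.get_ic("pearson", plot=False) is ic  # second call is served from the cache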
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 120
 3 | 
 4 | [build-system]
 5 | requires = ["setuptools>=65.5.0", "wheel"]
 6 | build-backend = "setuptools.build_meta"
 7 | 
 8 | [project]
 9 | name = "firefin"
10 | version = "0.2.1"
11 | description = "The bundled open-source toolkit for the book Navigate through the Factor Zoo: The Science of Factor Investing."
12 | readme = "README.md"
13 | authors = [{ name = "Renjie Liao", email = "auderson@qq.com" }, { name = "Baochen Qiao", email = "baochenqiao@gmail.com" }]
14 | dependencies = [
15 |     "click >= 8.1.3",
16 |     "pandas >= 2.2.1",
17 |     "matplotlib >= 3.8.3",
18 |     "seaborn >= 0.13.2",
19 |     "statsmodels >= 0.14.1",
20 |     "scipy >= 1.12.0",
21 |     "numba >= 0.59.0",
22 |     "loguru >= 0.7.2",
23 |     "tqdm >= 4.66.4",
24 |     "joblib >= 1.4.2",
25 | ]
26 | 
27 | [project.scripts]
28 | firefin = "firefin.cli.command:cli"
-------------------------------------------------------------------------------- /tests/evaluation/Beta_test.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.stats import t as t_dist
 3 | import pandas as pd
 4 | # The null hypothesis here is β = 0 (i.e., the factor has no effect)
 5 | def beta_t_test(resid: pd.DataFrame, beta: pd.DataFrame, date, window_size, factors: pd.DataFrame) -> tuple:
 6 |     """
 7 |     Run a t-test (H0: β = 0) on each β from the multi-factor regression.
 8 | 
 9 |     :param resid: regression residual matrix, shape (T, N)
10 |     :param beta: β coefficient matrix, shape (N, K)
11 |     :param factors: factor return matrix, shape (T, K)
12 |     :return:
13 |         t_stats: matrix of t-statistics, shape (N, K)
14 |         p_values: matrix of two-sided p-values, shape (N, K)
15 |     """
16 |     beta = beta.loc[date].to_numpy()
17 |     if beta.ndim == 1:
18 |         beta = beta.reshape(1, -1).T  # N x 1
19 |     rows = []
20 |     for i in range(window_size):
21 |         resid_i = resid[i].loc[date].to_numpy()
22 |         rows.append(resid_i)
23 |     resid = np.stack(rows, axis=0).T  # resid is an N x T matrix
24 |     factors = factors.loc[:date].tail(window_size).to_numpy()
25 |     if factors.ndim == 1:
26 |         factors = factors.reshape(1, -1)
27 |     N, T = resid.shape
28 |     K = 1  # factors.shape[1]; factors is a 1 x T matrix
29 | 
30 |     df = T - K - 1
31 | 
32 |     cov_factor = np.cov(factors, rowvar=True, ddof=0).item()  # factor variance (MLE, divide by T)
33 |     sigma = []
34 |     for i in range(N):
35 |         residi = resid[i]
36 |         cov_epsilon_i = np.cov(residi, rowvar=True, ddof=K)
37 |         sigma.append(cov_epsilon_i.item())
38 | 
39 |     t_stats = np.zeros((N, K))
40 |     p_values = np.zeros((N, K))
41 | 
42 |     for j in range(N):
43 |         t_stats_j = beta[j] * np.sqrt(T * cov_factor) / np.sqrt(sigma[j])  # se(β_j) = σ_j / sqrt(T * var(f))
44 |         t_stats[j] = t_stats_j
45 |         p_value_j = 2 * (1 - t_dist.cdf(np.abs(t_stats_j), df))
46 |         p_values[j] = p_value_j
47 | 
48 | 
49 | 
50 |     return t_stats, p_values
51 | 
52 | 
53 | 
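For reference, the statistic computed above is the textbook OLS slope t-test for a single factor with homoskedastic errors:

t_j = \frac{\hat\beta_j}{\mathrm{se}(\hat\beta_j)}, \qquad
\mathrm{se}(\hat\beta_j) = \frac{\hat\sigma_{\varepsilon,j}}{\sqrt{T\,\widehat{\mathrm{Var}}(f)}}, \qquad
t_j \sim t_{T-K-1} \ \text{under } H_0\colon \beta_j = 0,

so the factor's sample variance enters through \sqrt{T\,\widehat{\mathrm{Var}}(f)} and the degrees of freedom are T - K - 1, matching `df` in the code.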
54 | def beta_test(resid: pd.DataFrame, beta: pd.DataFrame, date, window_size: int, factors: pd.DataFrame) -> str:
55 |     """
56 |     Run a t-test (H0: β = 0) on each β from the single-factor regression, and print the result as a LaTeX table.
57 | 
58 |     :param resid: regression residual matrix, shape (T, N); index is date, columns are asset names
59 |     :param beta: β coefficient matrix; index is date, columns are asset names
60 |     :param date: the date on which to run the test
61 |     :param window_size: window size
62 |     :param factors: factor return matrix; index is date, columns are factor names (single factor only)
63 |     :return: the LaTeX table as a string
64 |     """
65 |     # Extract the cross-sectional betas
66 |     beta_vals = beta.loc[date].to_numpy()
67 |     if beta_vals.ndim == 1:
68 |         beta_vals = beta_vals.reshape(-1)
69 | 
70 |     # Build the residual matrix (N x T)
71 |     rows = []
72 |     for i in range(window_size):
73 |         resid_i = resid[i].loc[date].to_numpy()
74 |         rows.append(resid_i)
75 |     resid_mat = np.stack(rows, axis=0).T  # N x T matrix
76 | 
77 |     # Extract the factor return series over the last window_size periods
78 |     fac = factors.loc[:date].tail(window_size).to_numpy().reshape(-1)
79 | 
80 |     N, T = resid_mat.shape
81 |     K = 1
82 |     df = T - K - 1
83 | 
84 |     # Factor variance (MLE, ddof=0)
85 |     cov_factor = np.var(fac, ddof=0)
86 |     # Residual variance of each asset (ddof=K)
87 |     sigma = [np.var(resid_mat[i], ddof=K) for i in range(N)]
88 | 
89 |     # Compute t-statistics and two-sided p-values
90 |     t_stats = beta_vals * np.sqrt(T * cov_factor) / np.sqrt(sigma)  # t = β * sqrt(T * var(f)) / σ_ε
91 |     p_values = 2 * (1 - t_dist.cdf(np.abs(t_stats), df))
92 | 
93 |     # Organize into a DataFrame
94 |     asset_names = resid.columns.tolist() if hasattr(resid, 'columns') else beta.columns.tolist()
95 |     df_result = pd.DataFrame({
96 |         't-statistic': t_stats,
97 |         'p-value': p_values
98 |     }, index=asset_names)
99 |     df_result.index.name = 'Asset'
100 | 
101 |     # Generate the LaTeX table
102 |     latex_table = df_result.to_latex(
103 |         float_format="%.4f",
104 |         caption="t-statistics and p-values for each asset",
105 |         label="tab:beta_t_test",
106 |         escape=False
107 |     )
108 | 
109 |     print(latex_table)
110 |     return latex_table
-------------------------------------------------------------------------------- /tests/evaluation/MSR_Test.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.MSR_Test import MSRTest
 2 | from firefin.data.fake import gen_df
 3 | 
 4 | factor1 = gen_df(253, 100, index="day", mock="rand")
 5 | factor2 = gen_df(253, 100, index="day", mock="rand")
 6 | 
 7 | def test_MSR_Test():
 8 |     result = MSRTest.run_msr_comparison(factor1, factor2, regularize_covariance=True)
 9 |     print("Model A MSR:", result["msr_a"])
10 |     print("Model B MSR:", result["msr_b"])
11 |     print("Z-statistic:", result["test_stat"])
12 |     print("P-value:", result["p_value"])
13 | 
14 | test_MSR_Test()
-------------------------------------------------------------------------------- /tests/evaluation/eva_utils.py: --------------------------------------------------------------------------------
 1 | from firefin.data.fake import gen_df
 2 | from firefin.evaluation.eva_utils import factor_to_quantile_dependent_double_sort
 3 | 
 4 | def test_factor_to_quantile_dependent_double_sort():
 5 |     factor1 = gen_df(10, 100, index="day", mock="rand")
 6 |     factor2 = gen_df(10, 100, index="day", mock="rand")
 7 | 
 8 |     double_sort = factor_to_quantile_dependent_double_sort(factor1, factor2, quantiles=(3, 5))
 9 |     print(double_sort.head())
10 | 
11 | 
12 | test_factor_to_quantile_dependent_double_sort()
-------------------------------------------------------------------------------- /tests/evaluation/fama_macbeth.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.fama_macbeth import FamaMacBeth
 2 | from firefin.data.fake import gen_df
 3 | from firefin.data import fetch_data
 4 | 
 5 | data = fetch_data(['open','close','volume','return_adj'])
 6 | 
 7 | def test_fama_macbeth_regression():
 8 |     r = FamaMacBeth.run_regression(data['close'], data['return_adj'], window=252, verbose=10, n_jobs=24)
 9 |     print(r)
10 |     stats = FamaMacBeth.test_statistics(r)
11 |     print(stats)
12 | 
13 | test_fama_macbeth_regression()
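The next file implements the Gibbons–Ross–Shanken (1989) test. For reference, the statistic it computes (with α̂ the N×1 intercepts, Σ̂ the N×N residual covariance, and μ̂, Ω̂ the factor mean vector and K×K covariance) is

\mathrm{GRS} = \frac{T}{N}\cdot\frac{T-N-K}{T-K-1}\cdot
\frac{\hat\alpha^{\top}\hat\Sigma^{-1}\hat\alpha}{1+\hat\mu^{\top}\hat\Omega^{-1}\hat\mu}
\;\sim\; F(N,\ T-N-K) \quad \text{under } H_0\colon \alpha = 0,

which is exactly the `dTestStat` expression and the F degrees of freedom used below.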
-------------------------------------------------------------------------------- /tests/evaluation/grs.py: --------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from scipy.stats import f
 4 | def grs_test(resid: pd.DataFrame, alpha: pd.DataFrame, date, window_size, factors: pd.DataFrame, label: str = "tab:grs", caption: str = "GRS test results") -> None:
 5 |     """ Perform the Gibbons, Ross and Shanken (1989) test.
 6 |     :param resid: Matrix of residuals from the OLS of size TxN.
 7 |     :param alpha: Vector of alphas from the OLS of size Nx1.
 8 |     :param factors: Matrix of factor returns size KxT.
 9 |     :return: Test statistic and p-value of the test statistic.
10 |     """
11 |     # Data preparation
12 |     alpha = alpha.loc[date].to_numpy()
13 |     if alpha.ndim == 1:
14 |         alpha = alpha.reshape(1, -1).T  # N x 1
15 |     rows = []
16 |     for i in range(window_size):
17 |         resid_i = resid[i].loc[date].to_numpy()
18 |         rows.append(resid_i)
19 |     resid = np.stack(rows, axis=0).T
20 |     factors = factors.loc[:date].tail(window_size).to_numpy()
21 |     if factors.ndim == 1:
22 |         factors = factors.reshape(1, -1)
23 | 
24 | 
25 |     # Determine the time series and assets
26 |     N, T = resid.shape
27 |     K = factors.shape[0]  # factors is a K x T matrix
28 |     if T - N - K <= 0:
29 |         # the F-test needs T > N + K degrees of freedom
30 |         raise ValueError("time period T should be greater than number of assets N plus factors K")
31 | 
32 | 
33 |     # Covariance of the residuals
34 |     Sigma = np.cov(resid, rowvar=True, ddof=K)  # N x N residual covariance matrix
35 | 
36 |     # Mean excess returns of the risk factors
37 |     factor_mean = np.mean(factors, axis=1, keepdims=True)  # K x 1 vector of factor means
38 | 
39 | 
40 |     # Covariance matrix of factors
41 |     omega = np.cov(factors, rowvar=True, ddof=0)  # K x K factor covariance (MLE, divide by T)
42 |     omega = np.atleast_2d(omega)
43 |     inv_omega = np.linalg.pinv(omega)
44 |     inv_Sigma = np.linalg.pinv(Sigma)
45 |     mult_ = (factor_mean.T @ inv_omega @ factor_mean).item()
46 |     mult = 1 / (1 + mult_)
47 |     inter = (alpha.T @ inv_Sigma @ alpha).item()
48 |     # GRS statistic
49 |     dTestStat = (T / N) * ((T - N - K) / (T - K - 1)) * inter * mult
50 |     # p-value of the F-test
51 |     df1 = N
52 |     df2 = T - N - K
53 |     pVal = 1 - f.cdf(dTestStat, df1, df2)
54 |     df = pd.DataFrame(
55 |         {"Value": [dTestStat, pVal]},
56 |         index=["GRS statistic", "p-value"],
57 |     )
58 | 
59 |     # Print the LaTeX code
60 |     print(df.to_latex(
61 |         float_format="%.4f",
62 |         caption=caption,
63 |         label=label,
64 |         header=False
65 |     ))
66 | 
-------------------------------------------------------------------------------- /tests/evaluation/portfolio_test.py: --------------------------------------------------------------------------------
 1 | from firefin.evaluation.academia.portfolio_sort import PortfolioSort
 2 | from firefin.data.fake import gen_df
 3 | 
 4 | 
 5 | 
 6 | factor1 = gen_df(10, 100, index="day", mock="rand")
 7 | factor2 = gen_df(10, 100, index="day", mock="rand")
 8 | 
 9 | # 1, 2, 3, 4, 5 periods
10 | forward_returns = { i: gen_df(10, 100, index="day", mock="return") for i in range(1, 6) }
11 | market_cap = gen_df(10, 100, index="day", mock="volume")
12 | 
13 | def test_single_sort():
14 |     # test single sort
15 |     single_sort_r = PortfolioSort.single_sort(factor1, forward_returns, market_cap, quantiles=5)
16 |     statistical_r = PortfolioSort.get_statistics(single_sort_r, quantiles=5)
17 | 
18 | def test_dual_sort():
19 |     # test dual sort
20 |     dual_sort_r = PortfolioSort.double_sort(factor1, factor2, forward_returns, market_cap, quantiles=(3,5))
21 |     # statistical_r_dual = PortfolioSort.get_statistics(dual_sort_r, quantiles=5)
22 |     print(dual_sort_r)
23 | 
24 | test_dual_sort()
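A single sort yields per-period quantile returns; a common next step is the top-minus-bottom spread. A sketch using the eva_utils pipeline (that PortfolioSort.single_sort returns the same {period: DataFrame} layout is an assumption here):

from firefin.data.fake import gen_df
from firefin.evaluation.eva_utils import compute_quantile_returns

factor = gen_df(253, 100, index="day", mock="rand")
forward_returns = {1: gen_df(253, 100, index="day", mock="return")}

qr = compute_quantile_returns(factor, forward_returns, quantiles=5)
spread = qr[1][5] - qr[1][1]  # long the top quintile, short the bottom
print(spread.mean(), spread.std())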
method="pearson") 15 | 16 | # compute factor 17 | factor = pv_corr(data["close"], data["volume"]) 18 | 19 | # compute forward returns 20 | fr = compute_forward_returns(data["open"].shift(-1), [1, 5, 10]) 21 | 22 | # compute industry evaluation 23 | mng = Evaluator(factor, fr) 24 | mng.get_ic("pearson") 25 | mng.get_quantile_returns(5) 26 | 27 | # compute academia evaluation 28 | from firefin.evaluation.academia.AcaEvaluatorModel import AcaEvaluatorModel 29 | 30 | mng = AcaEvaluatorModel(factor=factor, forward_returns=fr, return_adj=data["return_adj"], n_jobs=24, verbose=10) 31 | mng.run_all() -------------------------------------------------------------------------------- /tests/test_algo/test_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Created : 2025/3/27 11:36 3 | # @Author : Liao Renjie 4 | # @Email : liao.renjie@techfin.ai 5 | # @File : test_regression.py 6 | # @Software: PyCharm 7 | 8 | from unittest import TestCase 9 | 10 | import numpy as np 11 | import pandas as pd 12 | from numpy import array, nan 13 | 14 | from firefin.core.algorithm.regression import RollingRegressor, table_regression, rolling_regression 15 | 16 | 17 | class TestRegression(TestCase): 18 | 19 | def test_rolling_regressor_basic(self): 20 | 21 | x = np.arange(12).reshape(6, 2) 22 | y = np.arange(6) 23 | 24 | reg = RollingRegressor(x, y) 25 | self.assertEqual((2, 6, 2), reg.x.shape) 26 | self.assertEqual((6, 1), reg.y.shape) 27 | res = reg.fit(5) 28 | np.testing.assert_array_almost_equal( 29 | array( 30 | [ 31 | [nan, nan], 32 | [nan, nan], 33 | [nan, nan], 34 | [nan, nan], 35 | [0, -0.5], 36 | [0, -0.5], 37 | ] 38 | ), 39 | res.alpha, 40 | ) 41 | np.testing.assert_array_almost_equal( 42 | array( 43 | [ 44 | [nan, nan], 45 | [nan, nan], 46 | [nan, nan], 47 | [nan, nan], 48 | [0.5, 0.5], 49 | [0.5, 0.5], 50 | ] 51 | ), 52 | res.beta, 53 | ) 54 | 55 | def test_rolling_regression(self): 56 | x = pd.DataFrame(np.arange(12).reshape(6, 2), index=list("ABCDEF"), columns=list("ab"), dtype=float) 57 | y = x.copy() 58 | y.iloc[3:, 0] = x.iloc[3:, 0] * 2 + 1 59 | y.iloc[3:, 1] = x.iloc[3:, 1] * 4 + 1 60 | 61 | res0 = rolling_regression(x, y, 3) 62 | pd.testing.assert_frame_equal( 63 | res0.alpha, 64 | pd.DataFrame( 65 | array([[nan, nan], [nan, nan], [0, 0], [-4.66666667, -20.1666667], [-8.16666667, -32.3333333], [1, 1]]), 66 | index=list("ABCDEF"), 67 | columns=list("ab"), 68 | ), 69 | ) 70 | pd.testing.assert_frame_equal( 71 | res0.beta, 72 | pd.DataFrame( 73 | array([[nan, nan], [nan, nan], [1, 1], [2.75, 6.5], [3.25, 8], [2, 4]]), 74 | index=list("ABCDEF"), 75 | columns=list("ab"), 76 | ), 77 | ) 78 | 79 | res1 = rolling_regression([x, x], y, 3) 80 | pd.testing.assert_frame_equal(res0.alpha, res1.alpha) 81 | pd.testing.assert_frame_equal( 82 | pd.concat(res1.beta, axis=1), 83 | pd.DataFrame( 84 | array( 85 | [ 86 | [nan, nan, nan, nan], 87 | [nan, nan, nan, nan], 88 | [0.5, 0.5, 0.5, 0.5], 89 | [1.375, 3.25, 1.375, 3.25], 90 | [1.625, 4.0, 1.625, 4.0], 91 | [1.0, 2.0, 1.0, 2.0], 92 | ] 93 | ), 94 | index=list("ABCDEF"), 95 | columns=list("abab"), 96 | ), 97 | ) 98 | 99 | def test_table_regression(self): 100 | x = pd.DataFrame(np.arange(12).reshape(2, 6), index=list("ab"), columns=list("ABCDEF"), dtype=float) 101 | y = x.copy() 102 | y.iloc[0] = x.iloc[0] * 2 + 1 103 | y.iloc[1] = x.iloc[1] * 4 + 1 104 | w = x.copy() 105 | w.iloc[:] = 1 106 | 107 | for _w in [None, w]: 108 | with self.subTest(w=_w): 109 | res0 = table_regression(x, y, _w, 
110 |                 pd.testing.assert_frame_equal(res0.alpha + res0.beta * x, y)
111 | 
112 |                 res1 = table_regression(x, y, _w, axis=1)
113 |                 pd.testing.assert_frame_equal(x.mul(res1.beta, axis=0).add(res1.alpha, axis=0), y)
114 | 
115 |     def test_table_regression_weights(self):
116 |         x = pd.DataFrame(np.arange(12).reshape(2, 6), index=list("ab"), columns=list("ABCDEF"), dtype=float)
117 |         y = x.copy()
118 |         y.iloc[0] = x.iloc[0] * 2 + 1
119 |         y.iloc[1] = x.iloc[1] * 4 + 1
120 |         w = x.copy()
121 | 
122 |         # w does not affect the result here: y is an exact linear function of x, so the fit is exact for any weights
123 |         res0 = table_regression(x, y, w, axis=1)
124 |         pd.testing.assert_frame_equal(x.mul(res0.beta, axis=0).add(res0.alpha, axis=0), y)
-------------------------------------------------------------------------------- /tests/test_data.py: --------------------------------------------------------------------------------
 1 | # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 2 | # For details: https://github.com/fire-institute/fire/blob/master/NOTICE.txt
 3 | 
 4 | from firefin.data.gateway import fetch_data, DATA_MAPS
 5 | 
 6 | print(DATA_MAPS)
 7 | 
 8 | data = fetch_data(["open", "TradingValue", "test_no_data", "cn_bond_2y"])  # fetch cn_bond_2y too, since it is printed below
 9 | 
10 | print(data["open"])
11 | print(data["test_no_data"])
12 | print(data["cn_bond_2y"])
--------------------------------------------------------------------------------