├── .github └── workflows │ ├── ci.yml │ ├── codecov.yml │ ├── deploy-mkdocs.yml │ └── ruff.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── api.md ├── assets │ └── .icons │ │ ├── cc_white.svg │ │ └── favicon.ico ├── contributing.md ├── crowdcent.md ├── disclaimer.md ├── download.md ├── end_to_end.md ├── evaluation.md ├── index.md ├── meta.md ├── model_upload.md ├── models.md ├── numerframe.md ├── postprocessing.md ├── prediction_loaders.md ├── preprocessing.md ├── submission.md └── targets.md ├── examples ├── end_to_end.ipynb ├── google_cloud_storage.ipynb ├── numerai_pipeline.ipynb ├── numerbay_integration.ipynb ├── numerframe_tutorial.ipynb ├── quickstart.ipynb ├── submitting.ipynb └── synthetic_data_generation.ipynb ├── mkdocs.yml ├── pyproject.toml ├── pytest.ini ├── src └── numerblox │ ├── __init__.py │ ├── download.py │ ├── ensemble.py │ ├── evaluation.py │ ├── feature_groups.py │ ├── meta.py │ ├── misc.py │ ├── model_upload.py │ ├── models.py │ ├── neutralizers.py │ ├── numerframe.py │ ├── penalizers.py │ ├── prediction_loaders.py │ ├── preprocessing │ ├── __init__.py │ ├── base.py │ ├── classic.py │ └── signals.py │ ├── submission.py │ └── targets.py ├── tests ├── test_assets │ ├── eodhd-map.csv │ ├── mock_credentials.json │ └── val_3_eras.parquet ├── test_download │ ├── __init__.py │ ├── test_download_classic.py │ ├── test_download_crypto.py │ └── test_download_signals.py ├── test_end_to_end.py ├── test_ensemble.py ├── test_evaluation.py ├── test_meta.py ├── test_misc.py ├── test_models.py ├── test_neutralizers.py ├── test_numerframe.py ├── test_penalizers.py ├── test_prediction_loaders.py ├── test_preprocessing.py ├── test_submission.py ├── test_targets.py └── utils.py └── uv.lock /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.10', '3.11', '3.12'] 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install uv 25 | run: | 26 | curl -LsSf https://astral.sh/uv/install.sh | sh 27 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 28 | 29 | - name: Create and activate virtual environment 30 | run: | 31 | uv venv 32 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 33 | 34 | - name: Install dependencies 35 | run: | 36 | uv pip install 'setuptools[pkg_resources]' 37 | uv pip install -e ".[test]" 38 | 39 | - name: Run tests with coverage 40 | run: | 41 | uv pip install pytest-cov 42 | pytest -s tests/ --cov=numerblox --cov-report term-missing 43 | 44 | - name: Build wheel 45 | run: | 46 | uv pip install build 47 | python -m build --wheel 48 | 49 | - name: Install built wheel 50 | run: | 51 | uv pip install dist/*.whl 52 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Upload coverage to Codecov 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | coverage: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.12 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.12' 17 | - name: 
Install uv 18 | run: | 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 21 | - name: Create and activate virtual environment 22 | run: | 23 | uv venv 24 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 25 | - name: Install dependencies 26 | run: | 27 | uv pip install 'setuptools[pkg_resources]' 28 | uv pip install -e ".[test]" 29 | - name: Run tests with coverage 30 | run: | 31 | uv pip install pytest pytest-cov 32 | pytest -s tests/ --cov=numerblox --cov-report term-missing --cov-report=xml 33 | - name: Upload coverage to Codecov 34 | uses: codecov/codecov-action@v3 35 | env: 36 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 37 | with: 38 | files: ./coverage.xml 39 | fail_ci_if_error: false 40 | verbose: true 41 | -------------------------------------------------------------------------------- /.github/workflows/deploy-mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: MKDocs -> GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.12' 19 | 20 | - name: Install uv 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 24 | 25 | - name: Create and activate virtual environment 26 | run: | 27 | uv venv 28 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 29 | 30 | - name: Install dependencies 31 | run: | 32 | uv pip install 'setuptools[pkg_resources]' 33 | uv pip install mkdocs mkdocs-material mkdocstrings mkdocstrings-python 34 | 35 | - name: Install project 36 | run: uv pip install -e . 37 | 38 | - name: Build the MkDocs site 39 | run: mkdocs build 40 | 41 | - name: Deploy to GitHub Pages 42 | uses: peaceiris/actions-gh-pages@v3 43 | with: 44 | github_token: ${{ secrets.GITHUB_TOKEN }} 45 | publish_dir: ./site 46 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [push, pull_request] 3 | jobs: 4 | ruff: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: astral-sh/ruff-action@v1.1.0 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .jekyll-cache/ 2 | Gemfile.lock 3 | *.bak 4 | .gitattributes 5 | .last_checked 6 | .gitconfig 7 | *.bak 8 | *.log 9 | *~ 10 | ~* 11 | _tmp* 12 | tmp* 13 | tags 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | env/ 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # dotenv 97 | .env 98 | 99 | # virtualenv 100 | .venv 101 | venv/ 102 | ENV/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | 117 | .vscode 118 | *.swp 119 | 120 | # osx generated files 121 | .DS_Store 122 | .DS_Store? 123 | .Trashes 124 | ehthumbs.db 125 | Thumbs.db 126 | .idea 127 | 128 | # pytest 129 | .pytest_cache 130 | 131 | # tools/trust-doc-nbs 132 | docs_src/.last_checked 133 | 134 | # link checker 135 | checklink/cookies.txt 136 | 137 | # Numerai authentication 138 | key.json 139 | keys.json 140 | finnhub_key.json 141 | eod_key.json 142 | 143 | _docs/ 144 | sidebar.yml 145 | 146 | # Test files 147 | edu_nbs/*.h5 148 | prod_requirements.txt 149 | test_numclassic_general_* 150 | test_numcrypto_general_* 151 | some_path_* 152 | test_kaggle_* 153 | test_eod_* 154 | 155 | examples/data 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # Full API Reference 2 | 3 | This section provides a detailed reference to all objects defined in NumerBlox. 
4 | 5 | ## Download 6 | 7 | ::: numerblox.download 8 | 9 | ------------------------------------------------ 10 | 11 | ## NumerFrame 12 | 13 | ::: numerblox.numerframe 14 | 15 | ------------------------------------------------ 16 | 17 | ## Preprocessing 18 | 19 | ### Base Preprocessing 20 | 21 | ::: numerblox.preprocessing.base 22 | 23 | ### Classic Preprocessing 24 | 25 | ::: numerblox.preprocessing.classic 26 | 27 | ### Signals Preprocessing 28 | 29 | ::: numerblox.preprocessing.signals 30 | 31 | ------------------------------------------------ 32 | 33 | ## Meta 34 | 35 | ::: numerblox.meta 36 | 37 | ------------------------------------------------ 38 | 39 | ## Ensemble 40 | 41 | ::: numerblox.ensemble 42 | 43 | ------------------------------------------------ 44 | 45 | ## Neutralizers 46 | 47 | ::: numerblox.neutralizers 48 | 49 | ------------------------------------------------ 50 | 51 | ## Penalizers 52 | 53 | ::: numerblox.penalizers 54 | 55 | ------------------------------------------------ 56 | 57 | ## Prediction Loaders 58 | 59 | ::: numerblox.prediction_loaders 60 | 61 | ------------------------------------------------ 62 | 63 | ## Targets 64 | 65 | ::: numerblox.targets 66 | 67 | ------------------------------------------------ 68 | 69 | ## Evaluation 70 | 71 | ::: numerblox.evaluation 72 | 73 | ------------------------------------------------ 74 | 75 | ## Submission 76 | 77 | ::: numerblox.submission 78 | 79 | ------------------------------------------------ 80 | 81 | ## Model Upload 82 | 83 | ::: numerblox.model_upload 84 | 85 | ------------------------------------------------ 86 | 87 | ## Models 88 | 89 | ::: numerblox.models 90 | 91 | ------------------------------------------------ 92 | 93 | ## Miscellaneous 94 | 95 | ::: numerblox.misc 96 | 97 | ------------------------------------------------ -------------------------------------------------------------------------------- /docs/assets/.icons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/docs/assets/.icons/favicon.ico -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How To Contribute 2 | 3 | First, thank you for your consideration to contribute to `numerblox`! This document provides some general guidelines to streamline the contribution process. 4 | 5 | ## Installation 6 | 7 | If you haven't installed `numerblox` yet, clone the project into your favorite development environment. Install the repository in editable mode with all dev dependencies. 8 | 9 | Using pip: 10 | ```bash 11 | git clone https://github.com/crowdcent/numerblox.git 12 | cd numerblox 13 | pip install -e ".[test]" 14 | ``` 15 | 16 | Using [uv](https://github.com/astral-sh/uv): 17 | ```bash 18 | git clone https://github.com/crowdcent/numerblox.git 19 | cd numerblox 20 | uv venv 21 | uv pip install -e ".[test]" 22 | ``` 23 | 24 | ## Developing considerations 25 | 26 | ### 1. Building a new component 27 | 28 | If you would like to build a new component for Numerblox, please consider the following steps: 29 | 30 | 1. Place the new component in the appropriate section. Is it a Downloader (`download.py`), a Preprocessor (`preprocessing.py`) or a Submitting tool (`submission.py`)? 
Also check the documentation on that section for templates, conventions and how these blocks are constructed in general. 31 | 2. Add tests for this new component in the appropriate test file. If you are introducing a new Downloader, add tests in `tests/test_downloader.py`. If you are introducing a new Preprocessor, add tests in `tests/test_preprocessing.py`. etc. 32 | 3. When making a preprocessor or postprocessor, make sure the component follows [scikit-learn conventions](https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator). The core things to implement are inheriting from `BaseEstimator` and implementing `fit`, `transform` and `get_feature_names_out` methods. 33 | 4. If your component introduces new dependencies, make sure to add them to uv with `uv add `. 34 | 5. Consider adding support for [metadata routing](https://scikit-learn.org/stable/metadata_routing.html) if your component uses additional arguments for `fit`, `transform` and/or `predict`. Check out the documentation and other Numerblox components that use this feature for examples. We are also happy to help out with implementation of metadata routing. 35 | 36 | 37 | ### 2. Fixing bugs 38 | 39 | Even though most of the components in this library are tested, users will still likely run into issues. If you discover bugs, other issues or ideas for enhancements, do not hesitate to make a [Github issue](https://github.com/crowdcent/numerblox/issues). Describe in the issue what code was run on what machine and background on the issue. Add stacktraces and screenshots if this is relevant for solving the issue. Also, please add appropriate labels for the Github issue. 40 | 41 | - Ensure the bug was not already reported by searching on GitHub under Issues. 42 | - If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. 43 | - Be sure to add the complete error messages. 44 | - Be sure to add tests that fail without your patch, and pass with it. 45 | 46 | ### 3. Creating an example notebook 47 | 48 | We welcome example notebooks that demonstrate the use of `numerblox`. If you want to create an example notebook, please make a notebook in the `examples/` folder. Make sure to add appropriate descriptions and explain the process of using the various components. Before committing please run the notebook from top to bottom. If it runs without errors, you can commit the notebook. 49 | Lastly, if the notebook uses additional libraries, please note this at the top of the notebook and create a code block with `!pip install `. 50 | 51 | Example pip install cell: 52 | 53 | ```bash 54 | !pip install scikit-lego plotly 55 | ``` 56 | 57 | #### Did you write a patch that fixes a bug? 58 | - Open a new GitHub pull request with the patch. 59 | - Ensure that your PR includes a test that fails without your patch, and pass with it. 60 | - Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 61 | 62 | ## PR submission guidelines 63 | - Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needing to keep each PR focused. 64 | - Do not turn an already submitted PR into your development playground. 
If after you submitted PR, you discovered that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project. 65 | - If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work and then submit it again. Use common sense where you'd choose one way over another. 66 | -------------------------------------------------------------------------------- /docs/crowdcent.md: -------------------------------------------------------------------------------- 1 | # About CrowdCent 2 | 3 | CrowdCent is on a mission to decentralize investment management by changing the way investment funds make decisions and allocate capital. We are the machine learning and coordination layer for online investment communities looking to turn their data into actionable, investable portfolios. 4 | 5 | More information about CrowdCent can be found on [crowdcent.com](https://crowdcent.com). -------------------------------------------------------------------------------- /docs/disclaimer.md: -------------------------------------------------------------------------------- 1 | # Disclaimer 2 | 3 | -------------------------------------- 4 | 5 | Under no circumstances should any information provided in this software — or on associated distribution outlets — be construed as an offer soliciting the purchase or sale of any security or interest in any pooled investment vehicle sponsored, discussed, or mentioned by CrowdCent LLC or affiliates. Nor should it be construed as an offer to provide investment advisory services; an offer to invest in a CrowdCent investment vehicle will be made separately and only by means of the confidential offering documents of the specific pooled investment vehicles — which should be read in their entirety, and only to those who, among other requirements, meet certain qualifications under federal securities laws. Such investors, defined as accredited investors and qualified purchasers, are generally deemed capable of evaluating the merits and risks of prospective investments and financial matters. There can be no assurances that CrowdCent’s investment objectives will be achieved or investment strategies will be successful. Any investment in a vehicle managed by CrowdCent involves a high degree of risk including the risk that the entire amount invested is lost. Any investments or portfolio companies mentioned, referred to, or described are not representative of all investments in vehicles managed by CrowdCent and there can be no assurance that the investments will be profitable or that other investments made in the future will have similar characteristics or results. 6 | 7 | -------------------------------------- 8 | -------------------------------------------------------------------------------- /docs/download.md: -------------------------------------------------------------------------------- 1 | # Downloaders 2 | 3 | ## Numerai Classic 4 | 5 | `NumeraiClassicDownloader` simplifies downloading of datasets from Numerai's API. It allows you to easily download data with a few lines and the data is automatically organized in directories. 
6 | 7 | More information: [https://numer.ai/data](https://numer.ai/data) 8 | 9 | 10 | ```py 11 | from numerblox.download import NumeraiClassicDownloader 12 | 13 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 14 | 15 | # Training and validation data 16 | dl.download_training_data("train_val", version="5.0") 17 | 18 | # Live data 19 | dl.download_live_data("live", version="5.0") 20 | ``` 21 | 22 | Besides these common use cases you can also get feature sets and meta model predictions with `NumeraiClassicDownloader`. 23 | 24 | ```py 25 | from numerblox.download import NumeraiClassicDownloader 26 | 27 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 28 | 29 | # Get feature sets 30 | features = dl.get_classic_features() 31 | 32 | # Get meta model predictions 33 | dl.download_meta_model_preds() 34 | meta_model_preds = pd.read_parquet("my_numerai_data_folder/meta_model.parquet") 35 | ``` 36 | 37 | ## Numerai Signals 38 | 39 | Numerai provides a dataset for Numerai Signals. This is a good starting point for new users. 40 | 41 | More information: [https://signals.numer.ai/data](https://signals.numer.ai/data) 42 | 43 | ```py 44 | from numerblox.download import NumeraiSignalsDownloader 45 | 46 | dl = NumeraiSignalsDownloader(directory_path="my_numerai_signals_folder") 47 | 48 | # Download full dataset 49 | dl.download_training_data() 50 | 51 | # Live data 52 | dl.download_live_data() 53 | ``` 54 | 55 | ## Numerai Crypto 56 | 57 | For Numerai Crypto there are files to download. 58 | 59 | More information: [https://crypto.numer.ai/data](https://crypto.numer.ai/data) 60 | 61 | ```py 62 | from numerblox.download import NumeraiCryptoDownloader 63 | 64 | dl = NumeraiCryptoDownloader(directory_path="my_numerai_crypto_folder") 65 | 66 | # Training targets 67 | dl.download_training_data() 68 | 69 | # Live Crypto universe data 70 | dl.download_live_data() 71 | ``` 72 | 73 | ### EOD Historical Data 74 | 75 | Download data from EOD historical data. A common data vendor used for Numerai Signals. 76 | 77 | More information: [https://eodhistoricaldata.com](https://eodhistoricaldata.com) 78 | 79 | 80 | Make sure you have the underlying Python package for EOD installed. 81 | 82 | ```bash 83 | pip install eod 84 | ``` 85 | 86 | For EOD you also need to define credentials in the form of an API key. 87 | 88 | More information: [https://eodhd.com/pricing](https://eodhd.com/pricing) 89 | 90 | ```py 91 | from numerblox.download import EODDownloader 92 | 93 | eod_api_key = "MY_EOD_API_KEY" 94 | tickers = ["AAPL.US", "MSFT.US", "GOOG.US"] 95 | dl = EODDownloader(directory_path="my_numerai_signals_folder", 96 | key=eod_api_key, tickers=tickers) 97 | 98 | # Download full dataset 99 | dl.download_training_data(start="2008-01-01") 100 | 101 | # load data directly into DataFrame from January 1st 2024 for live. 102 | live_data = dl.download_live_data(start="2024-01-01") 103 | ``` 104 | 105 | ### Kaggle 106 | 107 | Some Numerai dataset are uploaded and maintained on Kaggle Datasets. NumerBlox offers a convenient API to download these datasets. 108 | 109 | For authentication, make sure you have a directory called .kaggle in your home directory 110 | with therein a kaggle.json file. 
kaggle.json should have the following structure: 111 | `{"username": USERNAME, "key": KAGGLE_API_KEY}` 112 | 113 | More info on authentication: [github.com/Kaggle/kaggle-api#api-credentials](https://github.com/Kaggle/kaggle-api#api-credentials) 114 | 115 | More info on the Kaggle Python API: [kaggle.com/donkeys/kaggle-python-api](https://kaggle.com/donkeys/kaggle-python-api) 116 | 117 | Also make sure you have the `kaggle` Python package installed. 118 | 119 | ```bash 120 | pip install kaggle 121 | ``` 122 | 123 | Below is a quickstart example using Katsu's starter dataset. 124 | 125 | ```py 126 | from numerblox.download import KaggleDownloader 127 | 128 | kd = KaggleDownloader(directory_path="my_numerai_signals_folder") 129 | 130 | # A good example of Numerai Signals data on Kaggle Datasets is Katsu1110's yfinance price dataset. 131 | kd.download_live_data("code1110/yfinance-stock-price-data-for-numerai-signals") 132 | ``` 133 | 134 | ### Google Cloud Storage Integration 135 | 136 | All NumerBlox downloaders inherit from `BaseIO`, which provides built-in support for Google Cloud Storage (GCS). This allows you to easily upload and download data to/from GCS buckets. 137 | 138 | #### Prerequisites 139 | 140 | Make sure you have Google Cloud Storage credentials configured. You'll need: 141 | - The `google-cloud-storage` Python package installed 142 | - Authentication set up (typically via `GOOGLE_APPLICATION_CREDENTIALS` environment variable or default credentials) 143 | 144 | #### Usage 145 | 146 | ```py 147 | from numerblox.download import NumeraiClassicDownloader 148 | 149 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 150 | 151 | # Download from GCS 152 | dl.download_file_from_gcs(bucket_name="my-bucket", gcs_path="path/to/file.parquet") 153 | dl.download_directory_from_gcs(bucket_name="my-bucket", gcs_path="path/to/directory") 154 | 155 | # Upload to GCS 156 | dl.upload_file_to_gcs(bucket_name="my-bucket", gcs_path="path/to/file.parquet", local_path="local_file.parquet") 157 | dl.upload_directory_to_gcs(bucket_name="my-bucket", gcs_path="path/to/directory") 158 | ``` 159 | 160 | This functionality is available for all downloaders (NumeraiClassicDownloader, NumeraiSignalsDownloader, NumeraiCryptoDownloader, EODDownloader, and KaggleDownloader) since they all inherit from BaseIO. 161 | 162 | ### Rolling your own downloader 163 | 164 | We invite users to build out their own downloaders for Numerai Signals. The only requirements are that you inherit from `numerblox.download.BaseDownloader` and implement the `download_training_data` and `download_live_data` methods. Below you will find a template for this. 165 | 166 | If you have a downloader that you would like to share with the community, please open a Pull Request in NumerBlox. 167 | 168 | ```py 169 | class AwesomeCustomDownloader(BaseDownloader): 170 | """ 171 | TEMPLATE - 172 | Download awesome financial data for Numerai Signals from who knows where. 173 | 174 | :param directory_path: Base folder to download files to. 175 | """ 176 | def __init__(self, directory_path: str): 177 | super().__init__(directory_path=directory_path) 178 | 179 | def download_live_data(self, *args, **kwargs): 180 | """ (minimal) weekly live downloading here. """ 181 | ... 182 | 183 | def download_training_data(self, *args, **kwargs): 184 | """ Training + validation dataset downloading here. """ 185 | ... 
186 | 187 | ``` -------------------------------------------------------------------------------- /docs/end_to_end.md: -------------------------------------------------------------------------------- 1 | # End To End Examples 2 | 3 | This section will show NumerBlox in action for some more advanced use cases. If you are looking for inspiration to leverage the power of NumerBlox, check out these examples. 4 | 5 | First we download the classic data with NumeraiClassicDownloader. We use a NumerFrame for convenience to parse the dataset. 6 | 7 | ```py 8 | from numerblox.numerframe import create_numerframe 9 | from numerblox.download import NumeraiClassicDownloader 10 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 11 | dl.download_training_data("train_val", version="5.0", int8=True) 12 | df = create_numerframe("my_numerai_data_folder/train_val/train.parquet") 13 | val_df = create_numerframe("my_numerai_data_folder/train_val/val.parquet") 14 | 15 | X, y = df.get_feature_target_pair(multi_target=False) 16 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 17 | era_series = df.get_era_data 18 | val_X, val_y = val_df.get_feature_target_pair(multi_target=False) 19 | val_features = val_df.get_feature_data 20 | val_eras = val_df.get_era_data 21 | ``` 22 | 23 | ## 1. Neutralized XGBoost pipeline 24 | 25 | Let's construct an end-to-end pipeline that does the following: 26 | - Augment FNCv3 features with group statistics features for the `sunshine` and `rain` data. 27 | - Fit 5 folds of XGBoost. 28 | - Ensemble them with a weighted average where the more recent folds get a higher weight. 29 | - Neutralize the predictions with respect to the original features. 30 | 31 | The external libraries used are xgboost and sklego. Make sure to have these dependencies installed. 32 | 33 | ```bash 34 | !pip install xgboost sklego 35 | ``` 36 | 37 | ```py 38 | from xgboost import XGBRegressor 39 | from sklego.preprocessing import ColumnSelector 40 | from sklearn.model_selection import TimeSeriesSplit 41 | from sklearn.pipeline import make_union 42 | from sklearn.compose import make_column_transformer 43 | 44 | from numerblox.preprocessing import GroupStatsPreProcessor 45 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 46 | from numerblox.ensemble import NumeraiEnsemble 47 | from numerblox.neutralizers import FeatureNeutralizer 48 | 49 | # Preprocessing 50 | gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain']) 51 | fncv3_selector = ColumnSelector(fncv3_cols) 52 | 53 | preproc_pipe = make_union(gpp, fncv3_selector) 54 | 55 | # Model 56 | xgb = XGBRegressor() 57 | cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5)) 58 | ens = NumeraiEnsemble() 59 | fn = FeatureNeutralizer(proportion=0.5) 60 | full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn) 61 | 62 | # Train full model 63 | full_pipe.fit(X, y, era_series=era_series) 64 | 65 | # Inference on validation data 66 | val_preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 67 | ``` 68 | 
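To sanity-check the fitted pipeline, the validation predictions can be scored with the evaluator described in the evaluation docs. The snippet below is a minimal sketch: it assumes `val_preds` comes back as a single prediction column and that the era column in the v5.0 validation data is named `era`.

```py
import numpy as np
from numerblox.evaluation import NumeraiClassicEvaluator, FAST_METRICS

# Attach the pipeline predictions to the validation DataFrame and compute per-era metrics.
val_df["prediction"] = np.asarray(val_preds).reshape(-1)
evaluator = NumeraiClassicEvaluator(era_col="era", metrics_list=FAST_METRICS)
metrics = evaluator.full_evaluation(val_df, pred_cols=["prediction"], target_col="target")
```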
69 | ## 2. Multi Classification Ensemble 70 | 71 | This example shows a multiclass classification approach where the Numerai target is transformed into integers (`[0, 0.25, 0.5, 0.75, 1.0] -> [0, 1, 2, 3, 4]`) and treated as a classification problem. 72 | 73 | When we call `predict_proba` on a classifier, the result will be a probability for every class, for example `[0.1, 0.2, 0.3, 0.2, 0.2]`. In order to reduce these to one number we use the `PredictionReducer`, which takes the probabilities for every model and reduces them with a vector multiplication (for example, `[0.1, 0.2, 0.3, 0.2, 0.2] @ [0, 1, 2, 3, 4] = 2.2`). It does this for every model, so the output of `PredictionReducer` has 3 columns. 74 | 75 | Because we set `donate_weighted=True` in `NumeraiEnsemble`, the 3 columns are reduced to one column using a weighted ensemble where the most recent fold gets the highest weight. Lastly, the final prediction column is neutralized. 76 | 77 | ```py 78 | from sklearn.tree import DecisionTreeClassifier 79 | from sklearn.model_selection import TimeSeriesSplit 80 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 81 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 82 | from numerblox.neutralizers import FeatureNeutralizer 83 | 84 | model = DecisionTreeClassifier() 85 | crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba') 86 | pred_rud = PredictionReducer(n_models=3, n_classes=5) 87 | ens2 = NumeraiEnsemble(donate_weighted=True) 88 | neut2 = FeatureNeutralizer(proportion=0.5) 89 | full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2) 90 | 91 | full_pipe.fit(X, y, era_series=era_series) 92 | 93 | preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 94 | ``` 95 | 96 | ## 3. Ensemble of ensembles of regressors 97 | 98 | This example introduces a `ColumnTransformer` that contains 3 pipelines. Each pipeline can have a different set of arguments. Here we simplify by passing the same columns to every pipeline. 99 | The output from all pipelines is concatenated, ensembled with `NumeraiEnsemble`, and the final ensembled column is neutralized. Note that every fold here is equally weighted. If you want to give recent folds more weight, set `weights` in `NumeraiEnsemble` for all `ColumnTransformer` outputs. 100 | 101 | ```py 102 | from sklearn.tree import DecisionTreeRegressor 103 | from sklearn.model_selection import TimeSeriesSplit 104 | from sklearn.pipeline import make_pipeline 105 | from sklearn.compose import make_column_transformer 106 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 107 | from numerblox.ensemble import NumeraiEnsemble 108 | from numerblox.neutralizers import FeatureNeutralizer 109 | 110 | 111 | pipes = [] 112 | for i in range(3): 113 | model = DecisionTreeRegressor() 114 | crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=5), predict_func='predict') 115 | pipe = make_pipeline(crossval) 116 | pipes.append(pipe) 117 | 118 | models = make_column_transformer(*[(pipe, X.columns.tolist()) for pipe in pipes]) 119 | ens_end = NumeraiEnsemble() 120 | neut = FeatureNeutralizer(proportion=0.5) 121 | full_pipe = make_meta_pipeline(models, ens_end, neut) 122 | 123 | full_pipe.fit(X, y, era_series=era_series) 124 | 125 | preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 126 | ``` 127 | -------------------------------------------------------------------------------- /docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluators 2 | 3 | NumerBlox offers evaluators for both Numerai Classic and Numerai Signals. 4 | 5 | ## Common Metrics 6 | 7 | For both `NumeraiClassicEvaluator` and `NumeraiSignalsEvaluator` you can set a custom `metrics_list` with all metrics you want to compute.
8 | 9 | By default, metrics will include `["mean_std_sharpe", "apy", "max_drawdown", "calmar_ratio"]` 10 | 11 | All valid metrics for `metrics_list` are: 12 | 13 | - "mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on Corrv2 (Numerai Correlation). 14 | 15 | - "apy" -> Annual Percentage Yield. 16 | 17 | - "max_drawdown" -> Max drawdown. 18 | 19 | - "calmar_ratio" -> [Calmar Ratio](https://www.investopedia.com/terms/c/calmarratio.asp). 20 | 21 | - "autocorrelation" -> Autocorrelation (1st order). 22 | 23 | - "max_feature_exposure" -> [Max feature exposure](https://forum.numer.ai/t/model-diagnostics-feature-exposure/899). 24 | 25 | - "smart_sharpe" -> Smart Sharpe. 26 | 27 | - "legacy_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on legacy model contribution. 28 | 29 | - "fn_mean_std_sharpe" -> [Feature Neutral](https://docs.numer.ai/tournament/feature-neutral-correlation) mean, standard deviation and Sharpe ratio (can take some time to compute). 30 | 31 | - "tb200_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on TB200. 32 | 33 | - "tb500_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on TB500. 34 | 35 | The following metrics only work if `benchmark_cols` are defined in `full_evaluation`: 36 | 37 | - "mc_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on [model contribution](https://forum.numer.ai/t/mmc-staking-starts-jan-2-2024/6827). 38 | 39 | - "corr_with" -> Correlation with benchmark predictions. 40 | 41 | - "ex_diss_pearson" (alias "ex_diss") -> [Exposure Dissimilarity](https://forum.numer.ai/t/true-contribution-details/5128/4) to benchmark predictions using Pearson correlation. 42 | 43 | - "ex_diss_spearman" -> [Exposure Dissimilarity](https://forum.numer.ai/t/true-contribution-details/5128/4) to benchmark predictions using Spearman correlation. Will be slower compared to "ex_diss_pearson". 44 | 45 | - "churn" -> [Churn](https://forum.numer.ai/t/better-lgbm-params-signals-v2-data-and-reducing-signals-churn/7638) is a statistic describing how the alpha scores of a signal changes over time. 46 | 47 | - "tb200_churn" -> Churn based on TB200. 48 | 49 | - "tb500_churn" -> Churn based on TB500. 50 | 51 | ## Numerai Classic specific metrics 52 | 53 | `NumeraiClassicEvaluator` can also compute [FNCv3](https://docs.numer.ai/numerai-tournament/scoring/feature-neutral-correlation#fnc-on-the-website). If you want to compute this add `fncv3_mean_std_sharpe` to the `metrics_list`. 54 | 55 | ```py 56 | from numerblox.evaluation import NumeraiClassicEvaluator, FAST_METRICS 57 | 58 | # Validation DataFrame to compute metrics on 59 | # Should have at least era_col, pred_cols and target_col columns. 60 | val_df = ... 61 | 62 | evaluator = NumeraiClassicEvaluator(era_col="era", metrics_list=FAST_METRICS) 63 | metrics = evaluator.full_evaluation(val_df, 64 | pred_cols=["prediction"], 65 | target_col="target", 66 | benchmark_cols=["benchmark1", "benchmark2"]) 67 | ``` 68 | 69 | ## Numerai Signals specific metrics 70 | 71 | `NumeraiSignalsEvaluator` offers [Numerai Signals diagnostics](https://forum.numer.ai/t/signals-diagnostics-guide/5950) scores. This is a special operation as it calls on Numerai servers and needs additional authentication, so it is not included in `full_evaluation`. 
72 | 73 | Example of how to get diagnostic scores for Numerai Signals: 74 | ```py 75 | import pandas as pd 76 | from numerblox.misc import Key 77 | from numerblox.evaluation import NumeraiSignalsEvaluator 78 | evaluator = NumeraiSignalsEvaluator() 79 | 80 | # A Numerai Signals model name you use. 81 | model_name = "MY_MODEL" 82 | # NumerBlox Key for accessing the Numerai API 83 | key = Key(pub_id="Hello", secret_key="World") 84 | # DataFrame with validation data containing prediction, date, ticker and data_type columns 85 | val_df = pd.DataFrame() 86 | 87 | evaluator.get_neutralized_corr(val_df, model_name=model_name, key=key, corr_col="validationRic") 88 | # Returns a Pandas DataFrame with validationRic. 89 | ``` 90 | 91 | ## Custom functions 92 | 93 | In addition to the default metrics, evaluators can be augmented with custom metrics. This can be done by defining a dictionary of functions and arguments. 94 | 95 | The custom function dictionary should have the following structure: 96 | ```py 97 | { 98 | "func1": # Metric name 99 | { 100 | "func": custom_function, # Function to call 101 | "args": { # General arguments (can be any type) 102 | "dataf": "dataf", 103 | "some_arg": "some_arg", 104 | }, 105 | "local_args": ["dataf"] # List of local variables to use/resolve 106 | }, 107 | "func2": 108 | { 109 | "func": custom_function2, 110 | "args": { 111 | "dataf": "dataf", 112 | "some_arg": "some_arg", 113 | }, 114 | "local_args": ["dataf"] 115 | }, 116 | (...) 117 | } 118 | ``` 119 | 120 | - The main keys (`func1` and `func2` in the example) will be the metric key names for the output evaluation DataFrame. 121 | 122 | - The `func` key should be a function that takes in the arguments defined in `args` as keyword arguments. `func` should be a callable function or class (i.e. a class that implements `__call__`). 123 | 124 | - The `args` key should be a dictionary with arguments to pass to `func`. The values of the dictionary can be any type. Arguments that you want resolved as local variables should be defined as strings (see `local_args` explanation). 125 | 126 | - The `local_args` key should be a list of strings that refer to variables that exist locally in the [evaluation_one_col](https://crowdcent.github.io/numerblox/api/#numerblox.evaluation.BaseEvaluator.evaluation_one_col) function. These local variables will be resolved to local variables for `func`. This allows you to use [evaluation_one_col](https://crowdcent.github.io/numerblox/api/#numerblox.evaluation.BaseEvaluator.evaluation_one_col) variables like `dataf`, `pred_col`, `target_col`, `col_stats`, `mean`, `per_era_numerai_corrs`, etc. 127 | 128 | 129 | Example of how to use custom functions in `NumeraiClassicEvaluator`: 130 | ```py 131 | import numpy as np 132 | from numerblox.evaluation import NumeraiClassicEvaluator 133 | def residuals(dataf, target_col, pred_col, val: int): 134 | """ Simple dummy func: mean of residuals. """ 135 | return np.mean(dataf[target_col] - dataf[pred_col] + val) 136 | 137 | custom_functions = { 138 | "residuals": { 139 | # Callable function 140 | "func": residuals, 141 | "args": { 142 | # String referring to local variables 143 | "dataf": "dataf", 144 | "pred_col": "pred_col", 145 | "target_col": "target_col", 146 | # Static argument 147 | "val": 0.0001, 148 | }, 149 | # List of local variables to use/resolve 150 | "local_args": ["dataf", "pred_col", "target_col"] 151 | }, 152 | } 153 | 154 | evaluator = NumeraiClassicEvaluator(custom_functions=custom_functions) 155 | 156 | # Inside the evaluator, residuals(dataf=dataf, pred_col="prediction", target_col="target", val=0.0001) is called. 157 | metrics = evaluator.full_evaluation(val_df, 158 | pred_cols=["prediction"], 159 | target_col="target") 160 | # metrics will contain a "residuals" column. 161 | ``` -------------------------------------------------------------------------------- /docs/meta.md: -------------------------------------------------------------------------------- 1 | # Meta Estimators 2 | 3 | Meta estimators wrap existing scikit-learn estimators to provide additional functionality. Currently, the following meta estimators are available: 4 | 5 | - [CrossValEstimator](#crossvalestimator) 6 | - [MetaPipeline](#metapipeline) 7 | 8 | ## CrossValEstimator 9 | 10 | `CrossValEstimator` provides a way to integrate cross-validation directly into model training, enabling simultaneous fitting of multiple models across data folds. By doing this, you can fit it as one transformer and get outputs for each fold during the prediction phase. 11 | 12 | ### Why CrossValEstimator? 13 | 14 | - **Holistic Training**: Cross-validation offers a more robust model training process by leveraging multiple subsets of your data. This way, your model's performance is less susceptible to the peculiarities of any single data split. 15 | 16 | - **Inherent Ensemble**: By training on multiple folds, you're essentially building an ensemble of models. Ensembles often outperform individual models since they average out biases, reduce variance, and are less likely to overfit. 17 | 18 | - **Custom Evaluation**: With the `evaluation_func` parameter, you can input your custom evaluation logic, allowing for flexible and tailored performance assessment for each fold. 19 | 20 | - **Flexibility with Predictions**: Choose between different prediction functions like 'predict', 'predict_proba', and 'predict_log_proba' using the `predict_func` parameter. 21 | 22 | - **Verbose Logging**: Gain insights into the training process with detailed logs during the fitting phase, aiding in debugging and understanding model performance across folds.
23 | 24 | ### Example 25 | 26 | ```py 27 | from sklearn.model_selection import KFold 28 | from xgboost import XGBRegressor 29 | 30 | from numerblox.meta import CrossValEstimator 31 | 32 | # Define the cross-validation strategy 33 | cv = KFold(n_splits=5) 34 | 35 | # Initialize the estimator 36 | estimator = XGBRegressor(n_estimators=100, max_depth=3) 37 | 38 | # (optional) Define a custom evaluation function 39 | def custom_eval(y_true, y_pred): 40 | return {"mse": ((y_true - y_pred) ** 2).mean()} 41 | 42 | # Initialize the CrossValEstimator 43 | cross_val_estimator = CrossValEstimator(cv=cv, 44 | estimator=estimator, 45 | evaluation_func=custom_eval) 46 | 47 | # Fit the CrossValEstimator 48 | cross_val_estimator.fit(X_train, y_train) 49 | predictions = cross_val_estimator.predict(X_test) 50 | ``` 51 | 52 | ## MetaPipeline 53 | 54 | The `MetaPipeline` extends the functionality of scikit-learn's `Pipeline` by seamlessly integrating models and post-model transformations. It empowers you to employ sophisticated data transformation techniques not just before, but also after your model's predictions. This is particularly useful when post-processing predictions, such as neutralizing feature exposures in financial models. 55 | 56 | ## Why MetaPipeline? 57 | 58 | - **Post-Model Transformations**: It can be crucial to apply transformations, like feature neutralization, after obtaining predictions. `MetaPipeline` facilitates such operations, leading to improved model generalization and stability. 59 | 60 | - **Streamlined Workflow**: Instead of managing separate sequences for transformations and predictions, you can orchestrate them under a single umbrella, simplifying both development and production workflows. 61 | 62 | - **Flexible Integration**: `MetaPipeline` gracefully handles a variety of objects, including `Pipeline`, `FeatureUnion`, and `ColumnTransformer`. This makes it a versatile tool adaptable to diverse tasks and data structures. 63 | 64 | #### Example 65 | 66 | Consider a scenario where you have an `XGBRegressor` model and want to apply a `FeatureNeutralizer` after obtaining the model's predictions: 67 | 68 | ```py 69 | from xgboost import XGBRegressor 70 | from numerblox.meta import MetaPipeline 71 | from numerblox.neutralizers import FeatureNeutralizer 72 | 73 | # Define MetaPipeline steps 74 | steps = [ 75 | ('xgb_regressor', XGBRegressor(n_estimators=100, max_depth=3)), 76 | ('feature_neutralizer', FeatureNeutralizer(proportion=0.5)) 77 | ] 78 | 79 | # Create MetaPipeline 80 | meta_pipeline = MetaPipeline(steps) 81 | 82 | # Train and predict using MetaPipeline 83 | meta_pipeline.fit(X_train, y_train) 84 | predictions = meta_pipeline.predict(X_test) 85 | ``` 86 | 87 | For a more succinct creation of a `MetaPipeline`, you can use the `make_meta_pipeline` function: 88 | 89 | ```py 90 | from numerblox.meta import make_meta_pipeline 91 | 92 | pipeline = make_meta_pipeline(XGBRegressor(n_estimators=100, max_depth=3), 93 | FeatureNeutralizer(proportion=0.5)) 94 | ``` -------------------------------------------------------------------------------- /docs/model_upload.md: -------------------------------------------------------------------------------- 1 | # Numerai Model Upload 2 | 3 | The `NumeraiModelUpload` class is designed for uploading trained models to Numerai for automated submissions. You can upload a single trained model or a complete `sklearn` pipeline, allowing seamless integration with various machine learning workflows. 
This class efficiently handles model serialization, validation, and uploading, making it adaptable for different types of models and workflows. 4 | 5 | ***Warning**: The `NumeraiModelUpload` class is designed to work with very specific requirements. For compatibility, make sure your environment matches the requirements listed in the official numerai-predict repository: [numerai-predict/requirements.txt](https://github.com/numerai/numerai-predict/blob/master/requirements.txt). Using different versions or additional packages may lead to issues during model upload and execution.* 6 | 7 | ## Why Use NumeraiModelUpload? 8 | 9 | - **Automation**: Automates the model submission process to Numerai, reducing the need for manual intervention. 10 | - **Support for Sklearn Pipelines**: Integrates seamlessly with `sklearn` pipelines and NumerBlox processors, allowing users to submit models with preprocessing, feature engineering, and stacking in a single workflow. 11 | - **Error Handling**: Offers robust error handling with retry logic, ensuring reliable uploads even in case of network or API errors. 12 | - **Custom Predict Function**: Supports custom prediction functions for advanced use cases, offering greater flexibility. 13 | 14 | ## Instantiation 15 | 16 | To use `NumeraiModelUpload`, instantiate it with a `Key` object containing your credentials and optional parameters for error handling. 17 | 18 | ```python 19 | from numerblox.misc import Key 20 | from numerblox.submission import NumeraiModelUpload 21 | 22 | key = Key(pub_id="your_public_id", secret_key="your_secret_key") 23 | 24 | uploader = NumeraiModelUpload( 25 | key=key, 26 | max_retries=3, 27 | sleep_time=15, 28 | fail_silently=True 29 | ) 30 | ``` 31 | 32 | ### Parameters: 33 | 34 | - **`key`**: (Key) Key object containing valid credentials for Numerai Classic. 35 | - **`max_retries`**: (int, optional) Maximum number of retries for uploading models to Numerai. Defaults to 2. 36 | - **`sleep_time`**: (int, optional) Time in seconds to wait between retries. Defaults to 10. 37 | - **`fail_silently`**: (bool, optional) Whether to suppress errors and skip failed uploads without raising exceptions. Useful for batch processing. Defaults to `False`. 38 | - **`*args, **kwargs`**: Additional arguments passed to `NumerAPI` initialization. 39 | 40 | ## Model Uploading 41 | 42 | The primary method for uploading models is `create_and_upload_model`, which serializes the model using `cloudpickle`, saves it to a file, and uploads it to Numerai. 43 | 44 | ### Example: Upload a Single Model 45 | 46 | ```python 47 | import pandas as pd 48 | from some_ml_library import TrainedModel 49 | 50 | # Assume you have a trained model named 'my_model' 51 | my_model = TrainedModel() 52 | 53 | uploader.create_and_upload_model( 54 | model=my_model, 55 | model_name="my_model_name", 56 | file_path="models/my_model.pkl" 57 | ) 58 | ``` 59 | 60 | ### Method: `create_and_upload_model` 61 | 62 | Creates a model prediction function, serializes it, and uploads the model to Numerai. 63 | 64 | #### Parameters: 65 | 66 | - **`model`**: (Any) The machine learning model object. 67 | - **`feature_cols`**: (Optional[List[str]]) List of feature column names for predictions. If `None`, all columns starting with "feature_" will be used. 68 | - **`model_name`**: (str) Numerai model name. 69 | - **`file_path`**: (str) Full path where the serialized model function will be saved. 70 | - **`data_version`**: (Optional[str]) Data version to use for model upload. 
71 | - **`docker_image`**: (Optional[str]) Docker image to use for model upload. 72 | - **`custom_predict_func`**: (Optional[Callable[[pd.DataFrame], pd.DataFrame]]) Custom predict function. If provided, it should accept a DataFrame and return a DataFrame with a "prediction" column. 73 | 74 | #### Returns: 75 | 76 | - **`upload_id`**: Upload ID if successful, `None` otherwise. 77 | 78 | ### Method: `get_available_data_versions` 79 | 80 | Retrieve available data versions for model uploads. 81 | 82 | #### Example 83 | 84 | ```python 85 | available_data_versions = uploader.get_available_data_versions() 86 | print(available_data_versions) 87 | ``` 88 | 89 | ### Method: `get_available_docker_images` 90 | 91 | Retrieve available Docker images for model uploads. 92 | 93 | #### Example 94 | 95 | ```python 96 | available_docker_images = uploader.get_available_docker_images() 97 | print(available_docker_images) 98 | ``` 99 | 100 | ### Method: `_get_model_id` 101 | 102 | Private method to get the model ID needed for model uploading. 103 | 104 | #### Parameters: 105 | 106 | - **`model_name`**: (str) The name of the model registered in Numerai. 107 | 108 | #### Returns: 109 | 110 | - **`model_id`**: (str) Corresponding model ID for the given model name. 111 | 112 | ### Method: `get_model_mapping` 113 | 114 | Property that returns a mapping between raw model names and their corresponding model IDs. 115 | 116 | #### Example 117 | 118 | ```python 119 | model_mapping = uploader.get_model_mapping 120 | print(model_mapping) 121 | ``` 122 | 123 | ## Example: Upload an Ensemble Model with Sklearn Pipeline 124 | 125 | To upload an ensemble model with multiple layers using an `sklearn` pipeline: 126 | 127 | ```python 128 | from sklearn.ensemble import StackingRegressor 129 | from sklearn.linear_model import RidgeCV 130 | from sklearn.ensemble import RandomForestRegressor 131 | 132 | # Create base models 133 | base_models = [ 134 | ('rf', RandomForestRegressor()), 135 | ('ridge', RidgeCV()) 136 | ] 137 | 138 | # Create stacking ensemble model 139 | stacking_model = StackingRegressor(estimators=base_models, final_estimator=RandomForestRegressor()) 140 | 141 | uploader.create_and_upload_model( 142 | model=stacking_model, 143 | model_name="ensemble_model_name", 144 | file_path="models/ensemble_model.pkl" 145 | ) 146 | ``` 147 | 148 | ## Note 149 | 150 | Ensure that the credentials and model names used in the above examples match those configured in your Numerai account. 151 | -------------------------------------------------------------------------------- /docs/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | ## EraBoostedXGBRegressor 4 | 5 | NOTE: This is still an experimental feature and subject to change. 6 | 7 | `EraBoostedXGBRegressor` is a custom regressor extending the functionality of XGBoost, aimed at improving accuracy on specific eras in a dataset. It upweights the eras that are toughest to fit. It is designed to integrate seamlessly with scikit-learn. 8 | 9 | ### Why? 10 | - Era-Specific Focus: Targets the worst-performing eras in your data for performance enhancement, ensuring that the model improves where it is most needed. 11 | - Scikit-learn integration: `EraBoostedXGBRegressor` is designed to integrate seamlessly with scikit-learn. 12 | - Customization Options: Offers flexibility to adjust the proportion of eras to focus on, the number of trees added per iteration, and the total number of iterations for era boosting. 
13 | 14 | ### Quickstart 15 | 16 | Make sure to include the era column as a `pd.Series` in the `fit` method. 17 | ```python 18 | from numerblox.models import EraBoostedXGBRegressor 19 | 20 | model = EraBoostedXGBRegressor(proportion=0.5, trees_per_step=10, num_iters=20) 21 | model.fit(X=X_train, y=y_train, era_series=eras_train) 22 | 23 | predictions = model.predict(X_live) 24 | ``` -------------------------------------------------------------------------------- /docs/numerframe.md: -------------------------------------------------------------------------------- 1 | # NumerFrame 2 | 3 | `NumerFrame` is an extension of `pd.DataFrame` tailored specifically for the data format and workflow commonly used by Numerai participants. It builds upon the base functionalities of a Pandas DataFrame by offering utilities that simplify working with Numerai datasets. 4 | 5 | ## Why? 6 | - **Intuitive Data Handling**: With built-in features like `get_feature_data`, `get_target_data`, and more, it simplifies extracting data subsets specific to Numerai competitions. 7 | 8 | - **Automated Column Grouping**: Automatically parses columns into recognizable groups such as features, targets, predictions, making data retrieval more intuitive and less error-prone. 9 | 10 | - **Support for Multiple Formats**: Through `create_numerframe`, it supports initializing from various data formats such as CSV, Parquet, Excel, and Pickle, providing a flexible interface for users. 11 | 12 | - **Optimized for Numerai**: Whether you're trying to fetch specific eras, feature groups or patterns like all 20-day targets, `NumerFrame` is designed to simplify those tasks for Numerai participants. 13 | 14 | - **Chainable Operations**: Since most operations return another `NumerFrame`, they can be conveniently chained for more complex workflows. 15 | 16 | - **Tailored for Machine Learning**: With methods like `get_feature_target_pair`, it aids in easily splitting the data for machine learning tasks specific to the Numerai competition. 17 | 18 | By using `NumerFrame`, participants can focus more on model development and less on data wrangling, leading to a smoother and more efficient workflow in the Numerai competition. 19 | 20 | 21 | ## Initialization 22 | A NumerFrame can be initialized either from an existing `pd.DataFrame` or with `create_numerframe`. The `create_numerframe` function takes a path to a file and returns a `NumerFrame` object. This function automatically parses the file and supports CSV, Parquet, Excel and Pickle formats. 23 | 24 | `NumerFrame` automatically parses columns into groups so you can easily retrieve what you need. It automatically is aware of the `era` column for its operations. 25 | 26 | `NumerFrame` follows a convention for feature groups. 27 | 28 | - Features are all columns that start with `feature`. 29 | 30 | - Targets are all columns that start with `target`. 31 | 32 | - Predictions are all columns that start with `prediction`. 33 | 34 | - Aux columns are all that fall in none of these buckets, like `era`, `data_type` and `id`. 35 | 36 | - Era column is either `era` or `date`. 
37 | 38 | ```py 39 | import pandas as pd 40 | from numerblox.numerframe import NumerFrame, create_numerframe 41 | # From DataFrame 42 | data = pd.read_parquet('train.parquet') 43 | df = NumerFrame(data) 44 | 45 | # With create_numerframe 46 | df = create_numerframe('train.parquet') 47 | ``` 48 | 49 | 50 | ## Examples 51 | 52 | Basic functionality: 53 | ```py 54 | # Get data for features, targets, predictions, and aux 55 | features = df.get_feature_data 56 | targets = df.get_target_data 57 | predictions = df.get_prediction_data 58 | aux_data = df.get_aux_data 59 | ``` 60 | 61 | Additionally it is possible to get groups specific to Numerai Classic like FNCv3 and internal feature groups. The examples below show some advanced functionality in `NumerFrame`. 62 | 63 | ```py 64 | # Get data for features, targets and predictions 65 | features = df.get_feature_data 66 | targets = df.get_target_data 67 | predictions = df.get_prediction_data 68 | 69 | # Get specific data groups 70 | fncv3_features = df.get_fncv3_feature_data 71 | group_features = df.get_group_features(group='rain') 72 | small_features = df.get_small_feature_data 73 | medium_features = df.get_medium_feature_data 74 | 75 | # Fetch columns by pattern. For example all 20 day targets. 76 | pattern_data = df.get_pattern_data(pattern='_20') 77 | # Or for example Jerome targets. 78 | jerome_targets = df.get_pattern_data(pattern='_jerome_') 79 | 80 | # Split into feature and target pairs. Will get single target by default. 81 | X, y = df.get_feature_target_pair() 82 | # Optionally get all targets 83 | X, y = df.get_feature_target_pair(multi_target=True) 84 | 85 | # Fetch data for specified eras 86 | X, y = df.get_era_batch(eras=['0001', '0002']) 87 | 88 | # Since every operation returns a NumerFrame they can be chained. 89 | # An example chained operation is getting features and targets for the last 2 eras. 90 | X, y = df.get_last_eras(2).get_feature_target_pair() 91 | ``` 92 | 93 | -------------------------------------------------------------------------------- /docs/postprocessing.md: -------------------------------------------------------------------------------- 1 | # Postprocessing 2 | 3 | ## Feature Neutralization 4 | 5 | `FeatureNeutralizer` provides classic feature neutralization by subtracting linear model influence, ensuring that predictions are not overly influenced by a specific set of features. 6 | 7 | ### Why? 8 | - **Reduce Overfitting**: By neutralizing predictions, you can potentially reduce the risk of overfitting to specific feature characteristics. 9 | - **Control Feature Influence**: Allows you to have a granular control on how much influence a set of features can exert on the final predictions. 10 | - **Enhance Model Robustness**: By limiting the influence of potentially noisy or unstable features, you might improve the robustness of your model's predictions across different data periods. 11 | 12 | ### Quickstart 13 | 14 | Make sure to pass both the features to use for penalization as a `pd.DataFrame` and the accompanying era column as a `pd.Series` to the `predict` method. 15 | 16 | Additionally, `pred_name` and `proportion` can be lists. In this case, the neutralization will be performed for each prediction name and proportion. For example, if `pred_name=["prediction1", "prediction2"]` and `proportion=[0.5, 0.7]`, then the result will be an array with 4 neutralized prediction columns. 17 | All neutralizations will be performed in parallel. 
18 | 19 | Single column neutralization: 20 | ```python 21 | import pandas as pd 22 | from numerblox.neutralizers import FeatureNeutralizer 23 | 24 | predictions = pd.Series([0.24, 0.87, 0.6]) 25 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 26 | era_data = pd.Series([1, 1, 2]) 27 | 28 | neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5) 29 | neutralizer.fit() 30 | neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data) 31 | ``` 32 | 33 | Multiple column neutralization: 34 | ```python 35 | import pandas as pd 36 | from numerblox.neutralizers import FeatureNeutralizer 37 | 38 | predictions = pd.DataFrame({"prediction1": [0.24, 0.87, 0.6], "prediction2": [0.24, 0.87, 0.6]}) 39 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 40 | era_data = pd.Series([1, 1, 2]) 41 | 42 | neutralizer = FeatureNeutralizer(pred_name=["prediction1", "prediction2"], proportion=[0.5, 0.7]) 43 | neutralizer.fit() 44 | neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data) 45 | ``` 46 | 47 | ## FeaturePenalizer 48 | 49 | `FeaturePenalizer` neutralizes predictions using TensorFlow based on provided feature exposures. It's designed to integrate seamlessly with scikit-learn. 50 | 51 | ### Why? 52 | - **Limit Feature Exposure**: Ensures that predictions are not excessively influenced by any individual feature, which can help in achieving more stable predictions. 53 | - **Enhanced Prediction Stability**: Penalizing high feature exposures can lead to more stable and consistent predictions across different eras or data splits. 54 | - **Mitigate Model Biases**: If a model relies too heavily on a particular feature, penalization can help balance out the biases and make the model more generalizable. 55 | 56 | ### Quickstart 57 | 58 | Make sure to pass both the features to use for penalization as a `pd.DataFrame` and the accompanying era column as a `pd.Series` to the `predict` method. 59 | ```python 60 | import pandas as pd 61 | from numerblox.penalizers import FeaturePenalizer 62 | predictions = pd.Series([0.24, 0.87, 0.6]) 63 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 64 | era_data = pd.Series([1, 1, 2]) 65 | 66 | penalizer = FeaturePenalizer(max_exposure=0.1, pred_name="prediction") 67 | penalizer.fit(X=predictions) 68 | penalized_predictions = penalizer.predict(X=predictions, features=feature_data, era_series=era_data) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/prediction_loaders.md: -------------------------------------------------------------------------------- 1 | # Prediction Loaders 2 | 3 | Prediction loaders are designed to seamlessly fetch and transform prediction data, especially from Numerai's API. These classes can be integrated into pipelines to automate the prediction generation process for the Numerai competition. 4 | 5 | ## Why? 6 | 7 | Numerai provides example predictions to help participants understand the expected structure and format of predictions. With the `ExamplePredictions` class, you can easily fetch these example predictions for different data versions, allowing you to quickly evaluate or test your models against Numerai's standard prediction dataset. 8 | 9 | ## ExamplePredictions 10 | 11 | ### Usage: 12 | The `ExamplePredictions` class fetches the example predictions for the specified version of the Numerai dataset.
This can be useful for testing or understanding the prediction structure and data distribution. 13 | 14 | Downloaded files are automatically cleaned up after the data is loaded with the `transform` method. To keep the files, make sure to set `keep_files=True` when instantiating the class. 15 | 16 | ```py 17 | from numerblox.prediction_loaders import ExamplePredictions 18 | # Instantiate and load example predictions for v5.0 19 | example_loader = ExamplePredictions(file_name="v5.0/live_example_preds.parquet", keep_files=False) 20 | example_preds_df = example_loader.transform() 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/preprocessing.md: -------------------------------------------------------------------------------- 1 | # Preprocessors 2 | 3 | NumerBlox offers a suite of preprocessors to easily perform Numerai-specific data transformations. All preprocessors are compatible with `scikit-learn` pipelines and feature a similar API. Note that some preprocessors may require an additional `eras` or `tickers` argument in the `transform` step. 4 | 5 | ## Numerai Classic 6 | 7 | ### GroupStatsPreProcessor 8 | 9 | The v4.2 (rain) dataset for Numerai Classic reintroduced feature groups. The `GroupStatsPreProcessor` calculates group statistics for all data groups. It uses predefined feature group mappings to generate statistical measures (mean, standard deviation, skew) for each of the feature groups. 10 | 11 | #### Example 12 | 13 | Here's how you can use the `GroupStatsPreProcessor`: 14 | 15 | ```python 16 | from numerblox.preprocessing import GroupStatsPreProcessor 17 | group_processor = GroupStatsPreProcessor(groups=['intelligence']) 18 | 19 | # Return features with group statistics for the 'intelligence' group 20 | features = group_processor.transform(X) 21 | ``` 22 | 23 | ## Numerai Signals 24 | 25 | ### ReduceMemoryProcessor 26 | 27 | The `ReduceMemoryProcessor` reduces the memory usage of the data as much as possible. It's particularly useful for the Numerai Signals dataset, which can be quite large. 28 | 29 | Note that modern Numerai Classic data (v4.2+) is already in int8 format, so this processor will not be useful for Numerai Classic. 30 | 31 | ```py 32 | from numerblox.preprocessing import ReduceMemoryProcessor 33 | 34 | processor = ReduceMemoryProcessor(deep_mem_inspect=True, verbose=True) 35 | reduced_data = processor.fit_transform(dataf) 36 | ``` 37 | 38 | ### KatsuFeatureGenerator 39 | 40 | `KatsuFeatureGenerator` performs feature engineering based on [Katsu's starter notebook](https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners). This is useful for those participating in the Numerai Signals contest. 41 | 42 | You can specify custom windows that indicate how many days to look back when generating features. 43 | 44 | ```py 45 | from numerblox.preprocessing import KatsuFeatureGenerator 46 | 47 | feature_gen = KatsuFeatureGenerator(windows=[7, 14, 21]) 48 | enhanced_data = feature_gen.fit_transform(dataf) 49 | ``` 50 | 51 | ### EraQuantileProcessor 52 | 53 | `EraQuantileProcessor` transforms features into quantiles by era. This can help normalize data and make patterns more distinguishable. Quantiling operations are parallelized across features for faster processing. 54 | 55 | Using `.transform` requires passing `era_series`. This is because the quantiles are calculated per era, so the processor needs that information along with the raw input features.
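To build intuition for what this does, quantiling by era is roughly equivalent to ranking each feature within its era and snapping the ranks into a fixed number of equal-sized bins. The following is a simplified pandas sketch for a single feature; it is not the parallelized library implementation, and the exact binning/scaling inside `EraQuantileProcessor` may differ.

```py
import numpy as np
import pandas as pd

def era_quantiles(feature: pd.Series, era_series: pd.Series, num_quantiles: int = 50) -> pd.Series:
    def _quantile_one_era(x: pd.Series) -> pd.Series:
        # Percentile rank within the era, then snap to `num_quantiles` discrete bins in [0, 1].
        pct_rank = x.rank(method="first", pct=True)
        bins = np.minimum(np.floor(pct_rank * num_quantiles), num_quantiles - 1)
        return bins / (num_quantiles - 1)

    return feature.groupby(era_series).transform(_quantile_one_era)
```

In practice you would simply use the processor as shown below.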
56 | 57 | ```py 58 | from numerblox.preprocessing import EraQuantileProcessor 59 | 60 | eq_processor = EraQuantileProcessor(num_quantiles=50, random_state=42) 61 | transformed_data = eq_processor.fit_transform(X, era_series=eras_series) 62 | ``` 63 | 64 | ### LagPreProcessor 65 | 66 | `LagPreProcessor` generates lag features based on specified windows. Lag features can capture temporal patterns in time-series data. 67 | 68 | Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step. 69 | 70 | ```py 71 | from numerblox.preprocessing import LagPreProcessor 72 | 73 | lag_processor = LagPreProcessor(windows=[5, 10, 20]) 74 | lag_processor.fit(X) 75 | lagged_data = lag_processor.transform(X, ticker_series=tickers_series) 76 | 77 | ``` 78 | 79 | ### DifferencePreProcessor 80 | 81 | `DifferencePreProcessor` computes the difference between features and their lags. It's used after `LagPreProcessor`. 82 | 83 | WARNING: `DifferencePreProcessor` works only on `pd.DataFrame` input and with columns that are generated by `LagPreProcessor`. If you are using these in a Pipeline, make sure `LagPreProcessor` is defined before `DifferencePreProcessor` and that the output API is set to pandas (`pipeline.set_output(transform="pandas")`). 84 | 85 | Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step, so a pipeline with both preprocessors will need a `ticker_series` argument in `.transform`. 86 | 87 | ```py 88 | from sklearn.pipeline import make_pipeline 89 | from numerblox.preprocessing import DifferencePreProcessor, LagPreProcessor 90 | 91 | lag = LagPreProcessor(windows=[5, 10]) 92 | diff = DifferencePreProcessor(windows=[5, 10], pct_diff=True) 93 | pipe = make_pipeline(lag, diff) 94 | pipe.set_output(transform="pandas") 95 | pipe.fit(X) 96 | diff_data = pipe.transform(X, ticker_series=tickers_series) 97 | ``` 98 | 99 | ### PandasTaFeatureGenerator 100 | 101 | `PandasTaFeatureGenerator` uses the `pandas-ta-classic` library to generate technical analysis features. It's a powerful tool for those interested in financial time-series data. 102 | 103 | Make sure you have `pandas-ta-classic` installed before using this feature generator: 104 | 105 | ```bash 106 | pip install pandas-ta-classic 107 | ``` 108 | 109 | Currently `PandasTaFeatureGenerator` only works on `pd.DataFrame` input. Its input is a DataFrame with columns `[ticker, date, open, high, low, close, volume]`. 110 | 111 | ```py 112 | from numerblox.preprocessing import PandasTaFeatureGenerator 113 | 114 | ta_gen = PandasTaFeatureGenerator() 115 | ta_features = ta_gen.transform(dataf) 116 | ``` 117 | 118 | ### MinimumDataFilter 119 | 120 | `MinimumDataFilter` filters out dates and tickers that don't have enough data. For example, it makes sense to filter out tickers for which you have fewer than 100 days of data. Also, dates that have fewer than 100 unique tickers can be filtered out. 121 | 122 | Additionally, you can specify a list of tickers to blacklist and exclude from your data. 123 | 124 | NOTE: This step only works with DataFrame input. 125 | 126 | ```py 127 | from numerblox.preprocessing import MinimumDataFilter 128 | 129 | min_data_filter = MinimumDataFilter(min_samples_date=200, min_samples_ticker=1200, blacklist_tickers=["SOMETICKER.BLA"]) 130 | filtered_data = min_data_filter.fit_transform(dataf) 131 | ``` 132 | 133 | ## Rolling your own preprocessor 134 | 135 | We invite the community to contribute their own preprocessors to NumerBlox.
If you have a preprocessor that you think would be useful to others, please open a PR with your code and tests. 136 | The new preprocessor should adhere to [scikit-learn conventions](https://scikit-learn.org/stable/developers/develop.html). Here are some of the most important things to keep in mind, followed by a template. 137 | 138 | - Make sure that your preprocessor inherits from `numerblox.preprocessing.base.BasePreProcessor`. This will automatically implement a blank fit method. It will also inherit from `sklearn.base.TransformerMixin` and `sklearn.base.BaseEstimator`. 139 | - Make sure your preprocessor implements a `transform` method that can take a `np.array` or `pd.DataFrame` as input and outputs an `np.array`. If your preprocessor can only work with `pd.DataFrame` input, mention this explicitly in the docstring. 140 | - Implement a `get_feature_names_out` method so it can support `pd.DataFrame` output with valid column names. 141 | 142 | ```py 143 | import numpy as np 144 | import pandas as pd 145 | from typing import Union 146 | from sklearn.utils.validation import check_is_fitted, check_array 147 | from numerblox.preprocessing.base import BasePreProcessor 148 | 149 | class MyAwesomePreProcessor(BasePreProcessor): 150 | def __init__(self, random_state: int = 0): 151 | super().__init__() 152 | # If you introduce additional arguments be sure to add them as attributes. 153 | self.random_state = random_state 154 | 155 | def fit(self, X: Union[np.array, pd.DataFrame], y=None): 156 | # Attributes can be set here for later use. 157 | self.n_cols_ = X.shape[1] 158 | return self 159 | 160 | def transform(self, X: Union[np.array, pd.DataFrame]) -> np.array: 161 | # Do your preprocessing here. 162 | # Can involve additional checks. 163 | check_is_fitted(self) 164 | X = check_array(X) 165 | return X 166 | 167 | def get_feature_names_out(self, input_features=None) -> list: 168 | # Return a list of feature names. 169 | # If you are not using pandas output, you can skip this method. 170 | check_is_fitted(self) 171 | return [f"awesome_output_feature_{i}" for i in range(self.n_cols_)] 172 | ``` 173 | -------------------------------------------------------------------------------- /docs/submission.md: -------------------------------------------------------------------------------- 1 | # Submitters 2 | 3 | NumerBlox provides submitters for both Numerai Classic and Signals. 4 | Also check out `examples/submitting.ipynb` for more information on Numerai submission. 5 | 6 | ## Why? 7 | - **Simplified Workflow**: Instead of managing multiple manual steps for submissions, `Submitters` allow you to simplify the submission process down to a few lines of code. 8 | 9 | - **Integrated Validation Checks**: Before submitting, `Submitters` perform a series of checks to ensure the submission format is correct and prevent common mistakes that could lead to invalid submissions. 10 | 11 | - **Security**: By providing a way to load credentials from a `.json` file, `Submitters` ensure that you're not hard-coding your secret credentials in the main code, reducing the risk of accidental exposure. 12 | 13 | - **Automatic Cleanup**: For users who run automated jobs, the ability to automatically clean up the environment post-submission ensures that your workspace remains clutter-free. 14 | 15 | With `Submitters`, you can focus more on developing and refining your model and spend less time on the manual aspects of the submission process.
16 | 17 | ## Instantiation 18 | 19 | In order to use a Submitter you should first create a `Key` object which handles credentials. 20 | There are two ways to create a `Key`: 21 | 22 | **1. Initialize `Key` with `pub_id` and `secret_key` from memory.** 23 | 24 | ```py 25 | from numerblox.misc import Key 26 | key = Key(pub_id="Hello", secret_key="World") 27 | ``` 28 | 29 | **2. Load credentials from `.json` file with `load_key_from_json`.** 30 | 31 | JSON file should have the following format: 32 | ```json 33 | {"pub_id": "PUBLIC_ID", "secret_key": "SECRET_KEY"} 34 | ``` 35 | We recommend loading from `.json`. With this method you only have to save your credentials in one (safe) place and avoid leaving reference to a secret key in Python code. 36 | 37 | ```py 38 | from numerblox.misc import load_key_from_json 39 | key = load_key_from_json("my_credentials.json") 40 | ``` 41 | 42 | ## Numerai Classic 43 | 44 | Submissions can be done in 2 lines of code. To initialize the submitter object, pass a directory path for saving submissions and a `Key` object. 45 | 46 | `NumeraiClassicSubmitter.full_submission` will perform: 47 | 1. Checks to prevent surprise behavior (including value range and column validity) 48 | 2. Saving to CSV 49 | 3. Uploading with `numerapi`. 50 | 51 | The `dataf` argument can be either a `pd.DataFrame` or `NumerFrame`. 52 | 53 | ```py 54 | from numerblox.submission import NumeraiClassicSubmitter 55 | submitter = NumeraiClassicSubmitter(directory_path="sub_current_round", key=key) 56 | # Your prediction file with 'id' as index and defined 'cols' below. 57 | dataf = pd.DataFrame(columns=["prediction"]) 58 | # Only works with valid key credentials and model_name 59 | submitter.full_submission(dataf=dataf, 60 | cols="prediction", 61 | file_name="submission.csv", 62 | model_name="my_model") 63 | ``` 64 | 65 | ## Numerai Signals 66 | 67 | `NumeraiSignalsSubmitter` is very similar to `NumeraiClassicSubmitter`, but has a few additional checks specific to Signals. Mainly, it checks if the data contains a valid ticker column (`"cusip"`, `"sedol"`, `"ticker"`, `"numerai_ticker"` or `"bloomberg_ticker"`) and a `'signal'` column. 68 | 69 | `NumeraiSignalsSubmitter.full_submission` handles checks, saving of CSV and uploading with `numerapi`. 70 | 71 | ```py 72 | from numerblox.submission import NumeraiSignalsSubmitter 73 | submitter = NumeraiSignalsSubmitter(directory_path="sub_current_round", key=key) 74 | # Your prediction file with 'id' as index, a valid ticker column and signal column below. 75 | dataf = pd.DataFrame(columns=['bloomberg_ticker', 'signal']) 76 | # Only works with valid key credentials and model_name 77 | submitter.full_submission(dataf=dataf, 78 | cols=["bloomberg_ticker", "signal"], 79 | file_name="submission.csv", 80 | model_name="my_signals_model") 81 | ``` 82 | 83 | ## Numerai Crypto 84 | 85 | `NumeraiCryptoSubmitter` has checks specific to Crypto. Mainly, it checks if the data contains a valid symbol column (`"symbol"`) and a `'signal'` column. 86 | 87 | `NumeraiCryptoSubmitter.full_submission` handles checks, saving of CSV and uploading with `numerapi`. 88 | 89 | ```py 90 | from numerblox.submission import NumeraiCryptoSubmitter 91 | submitter = NumeraiCryptoSubmitter(directory_path="sub_current_round", key=key) 92 | # Your prediction file with 'id' as index, a valid symbol column and signal column below. 
93 | dataf = pd.DataFrame(columns=['symbol', 'signal']) 94 | # Only works with valid key credentials and model_name 95 | submitter.full_submission(dataf=dataf, 96 | cols=["symbol", "signal"], 97 | file_name="submission.csv", 98 | model_name="my_crypto_model") 99 | ``` 100 | 101 | ## NumerBay 102 | 103 | NumerBlox also offers functionality to submit predictions from [NumerBay](https://numerbay.ai). This is a marketplace where Numerai predictions are bought and sold. Uploading from Numerbay is similar, but also requires authentication with your NumerBay account. 104 | 105 | Also make sure the `numerbay` library is installed. 106 | 107 | ```bash 108 | pip install numerbay 109 | ``` 110 | 111 | ```py 112 | from numerblox.submission import NumeraiClassicSubmitter, NumerBaySubmitter 113 | # Your prediction DataFrame 114 | dataf = pd.DataFrame(columns=["prediction"]) 115 | 116 | # Full submission to both Numerai and NumerBay 117 | numerbay_submitter = NumerBaySubmitter( 118 | tournament_submitter = NumeraiClassicSubmitter(directory_path="sub_current_round", key=key), 119 | numerbay_username="yourusername", 120 | numerbay_password="yourpassword" 121 | ) 122 | numerbay_submitter.full_submission( 123 | dataf=dataf, 124 | model_name="my_model", 125 | numerbay_product_full_name="numerai-predictions-yourproductname", 126 | file_name="submission.csv" 127 | ) 128 | ``` 129 | 130 | ## Note 131 | 132 | When you are done with submissions and don't need the submission file you can remove the submission directory with 1 line. Convenient if you have automated jobs and want to avoid clutter due to saving submission files for every round. 133 | 134 | ```py 135 | # Clean up environment 136 | submitter.remove_base_directory() 137 | ``` -------------------------------------------------------------------------------- /docs/targets.md: -------------------------------------------------------------------------------- 1 | # Target Engineering 2 | 3 | Target engineering object allows you to easily create synthetic targets to train on or to convert raw price data into Numerai-style targets. 4 | 5 | ## Why? 6 | 7 | - **Enhanced Experimentation**: The availability of synthetic targets through the `BayesianGMMTargetProcessor` allows modelers to test new algorithms, techniques, or strategies. 8 | 9 | - **Align with Numerai's Methodology**: `SignalsTargetProcessor` ensures that the targets you use are consistent with Numerai's approach. This alignment boosts the relevance of your models, potentially leading to better performance in the competition. 10 | 11 | - **Versatility**: With different windows and target types, `SignalsTargetProcessor` offers a rich set of features, allowing for a more nuanced approach to model training. By exploring different timeframes and target representations, you can gain a deeper understanding of the data's dynamics. 12 | 13 | - **Efficiency**: Manually engineering features or creating synthetic targets can be time-consuming and error-prone. These processors automate intricate steps, saving time and ensuring accuracy. 14 | 15 | By integrating these processors into your workflow, you can enhance your modeling capabilities, streamline experimentation, and align closer to Numerai's expectations. 16 | 17 | ## BayesianGMMTargetProcessor 18 | 19 | The `BayesianGMMTargetProcessor` generates synthetic targets based on a Bayesian Gaussian Mixture model. It's primarily used for creating fake targets, which are useful for experimenting and validating model structures without exposing true labels. 
20 | 21 | ### Example: 22 | ```py 23 | from numerblox.targets import BayesianGMMTargetProcessor 24 | processor = BayesianGMMTargetProcessor(n_components=3) 25 | processor.fit(X=train_features, y=train_targets, era_series=train_eras) 26 | fake_target = processor.transform(X=train_features, era_series=train_eras) 27 | ``` 28 | 29 | For more detailed examples and use-cases, check out `examples/synthetic_data_generation.ipynb.` 30 | 31 | 32 | ## SignalsTargetProcessor 33 | 34 | The `SignalsTargetProcessor` is specifically designed to engineer targets for Numerai Signals. This involves converting raw price data into Numerai-style targets. 35 | 36 | ### Example: 37 | ```py 38 | from numerblox.targets import SignalsTargetProcessor 39 | processor = SignalsTargetProcessor(price_col="close") 40 | signals_target_data = processor.transform(dataf=data, era_series=eras_column) 41 | ``` -------------------------------------------------------------------------------- /examples/google_cloud_storage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "All `Downloaders` and `Submittors` support Google Cloud Storage (GCS).\n", 8 | "\n", 9 | "__Credentials are detected automatically in the following way:__\n", 10 | "1. The environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set and points to a valid `.json` file.\n", 11 | "\n", 12 | "2. (Fallback 1) You have a valid Cloud SDK installation.\n", 13 | "\n", 14 | "3. (Fallback 2) The machine running the code is a GCP machine." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from numerblox.download import NumeraiClassicDownloader" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Example usage" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In order to use GCS you should:\n", 38 | "1. Instantiate a `Downloader` or `Submitter`.\n", 39 | "\n", 40 | "2a. For single files, call `.upload_file_to_gcs` or `.download_file_from_gcs`.\n", 41 | "\n", 42 | "2b. For directories, call `.upload_directory_to_gcs` or `.download_directory_from_gcs`." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "#### 1a. Downloading Numerai Classic inference data and uploading to GCS" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# This should point to a valid GCS bucket within your Google Cloud environment.\n", 59 | "bucket_name = \"test\"\n", 60 | "\n", 61 | "# Get inference data for current round\n", 62 | "downloader = NumeraiClassicDownloader(\"round_n\")\n", 63 | "downloader.download_inference_data(\"inference\", version=\"5.0\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "All the data that has been downloaded can be uploaded to a GCS bucket with 1 line of code." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Upload inference data for most recent round to GCS\n", 80 | "# downloader.upload_directory_to_gcs(bucket_name=bucket_name, gcs_path=\"round_n\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "#### 2b. 
Downloading inference data from GCS Bucket\n", 88 | "\n", 89 | "Conversely, A directory stored in a GCS bucket can be downloaded to your local directory. It will be stored in the base directory specified when you instantiated `nmr_downloader`." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Download data from bucket to local directory\n", 99 | "# downloader.download_directory_from_gcs(bucket_name=bucket_name, gcs_path=\"round_n\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Your local environment can be cleaned up with 1 line of code. Convenient if you are done with inference and would like to delete downloaded inference data automatically." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# Clean up environment\n", 116 | "downloader.remove_base_directory()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "------------------------------------" 124 | ] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "python3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.9.12" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 0 148 | } 149 | -------------------------------------------------------------------------------- /examples/numerbay_integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This part of the tutorial demonstrates how to use `NumerFrame` to download predictions bought on [NumerBay](http://numerbay.ai/) community marketplace. Currently only the main tournament is supported. Signals support will be added in future." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from numerblox.download import NumeraiClassicDownloader\n", 17 | "from numerblox.numerframe import create_numerframe" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "First, we download validation data using `NumeraiClassicDownloader`." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "downloader = NumeraiClassicDownloader(\"numerframe_edu\")\n", 34 | "# Path variables\n", 35 | "tournament_file = \"v5.0/validation.parquet\"\n", 36 | "tournament_save_path = f\"{str(downloader.dir)}/{tournament_file}\"\n", 37 | "# Download only tournament parquet file\n", 38 | "downloader.download_single_dataset(tournament_file, dest_path=tournament_save_path)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Loading in data and initializing a `NumerFrame` takes one line of code. It will automatically recognize the data format such as `.csv` or `.parquet`. You have the option to add metadata, which is stored in the `meta` attribute." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Initialize NumerFrame from parquet file path\n", 55 | "dataf = create_numerframe(tournament_save_path)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "dataf.head(2)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Call the `predict` method on the `NumerFrame` to fetch the prediction file from NumerBay. If the file already exists in the `data_directory`, that file will be loaded without re-downloading." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# preds = nb_model.predict(dataf)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "The predictions are concatenated to the `NumerFrame` with column name `prediction_numerai-predictions-numerbay`" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# preds" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "In this part of the tutorial we have downloaded a prediction file from NumerBay with `NumerFrame`. This makes things easier for post processing such as ensembling and neutralization." 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "------------------------------------------------------" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# NumerBay submission" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "This part of the tutorial is for sellers who want to upload their predictions to NumerBay to fulfill sale orders. Using `NumerBaySubmitter`, a seller can choose to submit to both Numerai and NumerBay or just NumerBay." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Assume we have some prediction column to upload for the Numerai main tournament, in this case the `prediction` column which simply takes the value of a feature." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# dataf = create_numerframe(tournament_save_path)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# dataf['prediction'] = dataf['feature_dichasial_hammier_spawner']" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Set `upload_to_numerai` to True (default) if you want to submit to both Numerai and NumerBay, set to False to submit only to NumerBay." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# nb_submitter = NumerBaySubmitter(tournament_submitter=numerai_submitter, upload_to_numerai=True, numerbay_username=\"numerbay\", numerbay_password=\"your_password\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Finally, we call the `full_submission` method to perform the submission" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# nb_submitter.full_submission(dataf, file_name='upload-full.csv', model_name='numerbay', numerbay_product_full_name='numerai-predictions-numerbay', cols='prediction')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "The process for Signals submission is very similar and is omitted for brevity, just do the following:\n", 189 | "- Use Signals NumerFrame\n", 190 | "- Change `NumeraiClassicSubmitter` to `NumeraiSignalsSubmitter` for the `tournament_submitter` argument\n", 191 | "- When calling `full_submission`, change the `cols` argument to the list of Signals column to submit (e.g. `['bloomberg_ticker', 'signal']`)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "------------------------------------------------------" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "After we are done we can easily clean up our downloaded data with one line of code called from the downloader." 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# Clean up environment\n", 215 | "downloader.remove_base_directory()" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "python3", 222 | "language": "python", 223 | "name": "python3" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /examples/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 0. Dependencies" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from xgboost import XGBRegressor\n", 18 | "\n", 19 | "from numerblox.download import NumeraiClassicDownloader\n", 20 | "from numerblox.evaluation import NumeraiClassicEvaluator\n", 21 | "from numerblox.misc import Key\n", 22 | "from numerblox.numerframe import create_numerframe\n", 23 | "from numerblox.prediction_loaders import ExamplePredictions\n", 24 | "from numerblox.submission import NumeraiClassicSubmitter" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# 1. Download" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "`NumeraiClassicDownloader` allows you to download training and inference data with a single line of code." 
39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Download data\n", 48 | "downloader = NumeraiClassicDownloader(\"data\")\n", 49 | "# Training and validation data\n", 50 | "downloader.download_training_data(\"train_val\", version=\"5.0\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# 2. Train" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "We use a custom Pandas DataFrame data structure called `NumerFrame` with `create_numerframe` here to easily parse the Numerai data. The usage of `NumerFrame` is completely optional, but greatly simplify the building of Numerai pipelines and experimentation with Numerai data.\n", 65 | "\n", 66 | "We then fit a simple XGBoost regressor model." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df = create_numerframe(\"data/train_val/train.parquet\")\n", 76 | "X, y = df.sample(100).get_feature_target_pair(multi_target=False)\n", 77 | "xgb = XGBRegressor()\n", 78 | "xgb.fit(X.values, y.values)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# 3. Evaluate" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "`NumeraiClassicEvaluator` will calculate all relevant Numerai metrics. \n", 93 | "\n", 94 | "`ExamplePredictions` is a NumerBlox class that handles downloading of example predictions for you. This object like all other NumerBlox processors can also used end to end in a scikit-learn pipeline." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "val_df = create_numerframe(\"data/train_val/validation.parquet\")[:100]\n", 104 | "val_df[\"prediction\"] = xgb.predict(val_df.get_feature_data)\n", 105 | "val_df[\"example_preds\"] = ExamplePredictions(\"v5.0/validation_example_preds.parquet\").fit_transform(None)[\"prediction\"].values[:100]\n", 106 | "evaluator = NumeraiClassicEvaluator()\n", 107 | "metrics = evaluator.full_evaluation(val_df, example_col=\"example_preds\", pred_cols=[\"prediction\"], target_col=\"target\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "# 4. Inference" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Here again `NumeraiClassicDownloader` and `NumerFrame` are leveraged to simplify inference." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "downloader.download_inference_data(\"current_round\", version=\"5.0\")\n", 131 | "live_df = create_numerframe(file_path=\"data/current_round/live.parquet\")\n", 132 | "live_X, live_y = live_df.get_feature_target_pair(multi_target=False)\n", 133 | "preds = xgb.predict(live_X)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# 5. Submission" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "`NumeraiClassicSubmitter` takes care of data integrity checks and submission to Numerai for you. Credentials are conveniently initialized with a `Key` object." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Submit\n", 157 | "NUMERAI_PUBLIC_ID = \"YOUR_PUBLIC_ID\"\n", 158 | "NUMERAI_SECRET_KEY = \"YOUR_SECRET_KEY\"\n", 159 | "key = Key(pub_id=NUMERAI_PUBLIC_ID, secret_key=NUMERAI_SECRET_KEY)\n", 160 | "submitter = NumeraiClassicSubmitter(directory_path=\"sub_current_round\", key=key)\n", 161 | "# Your prediction file with 'id' as index and defined 'cols' below.\n", 162 | "pred_dataf = pd.DataFrame(preds, index=live_df.index, columns=[\"prediction\"])\n", 163 | "# Only works with valid key credentials and model_name\n", 164 | "# submitter.full_submission(dataf=pred_dataf,\n", 165 | "# cols=\"prediction\",\n", 166 | "# file_name=\"submission.csv\",\n", 167 | "# model_name=\"MY_MODEL_NAME\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# 6. Clean up environment (optional)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "All downloader and submitter have functionality to remove themselver. This is especially convenient if you are running a daily inference pipeline on your server or a cloud VM." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "downloader.remove_base_directory()\n", 191 | "submitter.remove_base_directory()" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "classic_prod", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.11.5" 212 | }, 213 | "orig_nbformat": 4 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /examples/synthetic_data_generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This example notebook covers ways to generate synthetic data using `numerblox` components. Synthetic data can be a great way to improve performance simply by having more data to train. We will both cover ways to generate synthetic target variables and features." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 0. 
Download and load" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from uuid import uuid4\n", 24 | "\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from numerblox.download import NumeraiClassicDownloader" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "unique_id = uuid4()\n", 37 | "\n", 38 | "dl = NumeraiClassicDownloader(directory_path=f\"synth_test_{unique_id}\")\n", 39 | "dl.download_training_data(version=\"5.0\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "dataf = pd.read_parquet(f\"synth_test_{unique_id}/train.parquet\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "dataf.head(2)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 1. Synthetic target (Bayesian GMM)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "First we will tackle the problem of creating a synthetic target column to improve model performance. `BayesianGMMTargetProcessor` allows you to generate a new target variable based on a given target. The preprocessor sample the target from a [Bayesian Gaussian Mixture model](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html) which is fitted on coefficients from a [regularized linear model (Ridge regression)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html).\n", 72 | "\n", 73 | "This implementation is based on a [Github Gist by Michael Oliver (mdo)](https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93)." 
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from numerblox.targets import BayesianGMMTargetProcessor" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "dataf.head()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "bgmm = BayesianGMMTargetProcessor()\n", 101 | "bgmm.set_output(transform=\"pandas\")\n", 102 | "sample = dataf.sample(1000)\n", 103 | "X = sample[[\"feature_polaroid_vadose_quinze\", \"feature_genuine_kyphotic_trehala\"]].fillna(0.5)\n", 104 | "y = sample[\"target\"]\n", 105 | "eras = sample[\"era\"]\n", 106 | "bgmm.fit(X, y, eras=eras)\n", 107 | "fake_target = bgmm.transform(X, eras=eras)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "fake_target.head(10)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# Clean up environment\n", 126 | "dl.remove_base_directory()" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "python3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.9.12" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 0 151 | } 152 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: numerblox 2 | theme: 3 | name: material 4 | palette: 5 | # Palette toggle for light mode 6 | - media: "(prefers-color-scheme: light)" 7 | scheme: default 8 | primary: black 9 | accent: cyan 10 | toggle: 11 | icon: material/brightness-7 12 | name: Switch to dark mode 13 | # Palette toggle for dark mode 14 | - media: "(prefers-color-scheme: dark)" 15 | scheme: slate 16 | primary: black 17 | accent: cyan 18 | toggle: 19 | icon: material/brightness-4 20 | name: Switch to light mode 21 | font: 22 | text: Roboto 23 | custom_dir: docs/assets 24 | icon: 25 | logo: cc_white 26 | favicon: assets/.icons/favicon.ico 27 | features: 28 | - navigation.top 29 | - navigation.tracking 30 | - navigation.expand 31 | - navigation.path 32 | - content.code.copy 33 | - navigation.instant 34 | - navigation.instant.prefetch 35 | - navigation.sections 36 | 37 | repo_url: https://github.com/crowdcent/numerblox/ 38 | repo_name: crowdcent/numerblox 39 | 40 | nav: 41 | - Home: index.md 42 | - End-To-End Examples: end_to_end.md 43 | 44 | - Blox: 45 | - Downloaders: download.md 46 | - NumerFrame: numerframe.md 47 | - Preprocessing: preprocessing.md 48 | - Target Engineering: targets.md 49 | - Postprocessing: postprocessing.md 50 | - Meta Pipelines: meta.md 51 | - Prediction Loaders: prediction_loaders.md 52 | - Models: models.md 53 | - Evaluation: evaluation.md 54 | - Submitters: submission.md 55 | - Model Upload: model_upload.md 56 | 57 | - API: 58 | - API Reference: api.md 59 | 60 | - More: 61 | - How To Contribute: contributing.md 62 | - About CrowdCent: 
crowdcent.md 63 | - Disclaimer: disclaimer.md 64 | 65 | extra: 66 | social: 67 | - icon: fontawesome/brands/github 68 | link: https://github.com/CrowdCent 69 | name: crowdcent on github 70 | - icon: fontawesome/brands/x-twitter 71 | link: https://x.com/CrowdCent 72 | name: crowdcent on X 73 | generator: false 74 | 75 | copyright: Made by CrowdCent 76 | plugins: 77 | - search 78 | - mkdocstrings -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "numerblox" 3 | version = "1.6.1" 4 | description = "Solid Numerai Pipelines" 5 | authors = [ 6 | {name = "CrowdCent", email = "support@crowdcent.com"}, 7 | ] 8 | license = {text = "MIT License"} 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4" 11 | dependencies = [ 12 | "tqdm>=4.66.1", 13 | "numpy>=1.26.3", 14 | "scipy>=1.10.0", 15 | "pandas>=2.1.1", 16 | "pandas-ta-classic>=0.3.14b", 17 | "joblib>=1.3.2", 18 | "pyarrow>=14.0.1", 19 | "numerapi>=2.19.1", 20 | "matplotlib>=3.4.0", 21 | "scikit-learn>=1.6.1", 22 | "python-dateutil>=2.8.2", 23 | "google-cloud-storage>=2.11.0", 24 | "numerai-era-data>=0.1.1", 25 | "numerai-tools>=0.2.2", 26 | "polars>=1.5.0", 27 | "werkzeug>=3.0.3", 28 | ] 29 | 30 | [project.optional-dependencies] 31 | test = [ 32 | "pytest<8.0.0,>=7.4.2", 33 | "pytest-cov<5.0.0,>=4.1.0", 34 | "pytest-mock<4.0.0,>=3.11.1", 35 | "mkdocs<2.0.0,>=1.5.3", 36 | "mkdocs-material<10.0.0,>=9.4.2", 37 | "eod<1.0.0,>=0.2.1", 38 | "kaggle<2.0.0,>=1.5.16", 39 | "scikit-lego>=0.9.1", 40 | "xgboost>=2.0.0", 41 | "mkdocstrings-python<2.0.0,>=1.7.1", 42 | "ruff>=0.3.0", 43 | ] 44 | 45 | [tool.ruff] 46 | line-length = 300 47 | 48 | [tool.ruff.lint] 49 | ignore = ["F403", "F811"] 50 | select = ["E", "F", "I"] 51 | 52 | [tool.ruff.lint.per-file-ignores] 53 | "__init__.py" = ["F401"] 54 | 55 | [build-system] 56 | requires = ["setuptools>=61.0"] 57 | build-backend = "setuptools.build_meta" 58 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:pkg_resources is deprecated as an API:DeprecationWarning 4 | -------------------------------------------------------------------------------- /src/numerblox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/src/numerblox/__init__.py -------------------------------------------------------------------------------- /src/numerblox/ensemble.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scipy 7 | import sklearn 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | 10 | 11 | class NumeraiEnsemble(TransformerMixin, BaseEstimator): 12 | """ 13 | Ensembler that standardizes predictions by era and averages them. 14 | :param weights: Sequence of weights (float or int), optional, default: None. 15 | If None, then uniform weights are used. 16 | :param n_jobs: The number of jobs to run in parallel for fit. 17 | Will revert to 1 CPU core if not defined. 18 | -1 means using all processors. 19 | :param donate_weighted: Whether to use Donate et al.'s weighted average formula. 
20 | Often used when ensembling predictions from multiple folds over time. 21 | Paper Link: https://doi.org/10.1016/j.neucom.2012.02.053 22 | Example donate weighting for 5 folds: [0.0625, 0.0625, 0.125, 0.25, 0.5] 23 | """ 24 | 25 | def __init__(self, weights=None, donate_weighted=False): 26 | sklearn.set_config(enable_metadata_routing=True) 27 | self.set_transform_request(era_series=True) 28 | self.set_predict_request(era_series=True) 29 | super().__init__() 30 | self.weights = weights 31 | if self.weights and sum(self.weights) != 1: 32 | warnings.warn(f"Warning: Weights do not sum to 1. Got {sum(self.weights)}.") 33 | self.donate_weighted = donate_weighted 34 | 35 | def fit(self, X: Union[np.array, pd.DataFrame], y=None): 36 | self.is_fitted_ = True 37 | return self 38 | 39 | def transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: 40 | """ 41 | Standardize by era and ensemble. 42 | :param X: Input data where each column contains predictions from an estimator. 43 | :param era_series: Era labels (strings) for each row in X. 44 | :return: Ensembled predictions. 45 | """ 46 | assert era_series is not None, "Era series must be provided for NumeraiEnsemble." 47 | assert len(X) == len(era_series), f"input X and era_series must have the same length. Got {len(X)} != {len(era_series)}." 48 | 49 | if len(X.shape) == 1: 50 | raise ValueError("NumeraiEnsemble requires at least 2 prediction columns. Got 1.") 51 | 52 | n_models = X.shape[1] 53 | if n_models <= 1: 54 | raise ValueError(f"NumeraiEnsemble requires at least 2 predictions columns. Got {len(n_models)}.") 55 | 56 | # Override weights if donate_weighted is True 57 | if self.donate_weighted: 58 | weights = self._get_donate_weights(n=n_models) 59 | else: 60 | weights = self.weights 61 | 62 | if isinstance(X, pd.DataFrame): 63 | X = X.values 64 | # Standardize predictions by era 65 | standardized_pred_list = [] 66 | for i in range(n_models): 67 | # Skip standardization if all predictions are the same 68 | pred = X[:, i] 69 | if np.isnan(pred).any(): 70 | warnings.warn(f"Warning: Some predictions in column '{i}' contain NaNs. Consider checking your estimators. Ensembled predictions will also be a NaN.") 71 | if np.all(pred == pred[0]): 72 | warnings.warn(f"Warning: Predictions in column '{i}' are all constant. Consider checking your estimators. Skipping these estimator predictions in ensembling.") 73 | else: 74 | standardized_pred = self._standardize_by_era(pred, era_series) 75 | standardized_pred_list.append(standardized_pred) 76 | standardized_pred_arr = np.asarray(standardized_pred_list).T 77 | 78 | if not standardized_pred_list: 79 | raise ValueError("Predictions for all columns are constant. No valid predictions to ensemble.") 80 | 81 | # Average out predictions 82 | ensembled_predictions = np.average(standardized_pred_arr, axis=1, weights=weights) 83 | return ensembled_predictions.reshape(-1, 1) 84 | 85 | def fit_transform(self, X: Union[np.array, pd.DataFrame], y=None, era_series: pd.Series = None) -> np.array: 86 | self.fit(X, y) 87 | return self.transform(X, era_series) 88 | 89 | def predict(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: 90 | """ 91 | For if a NumeraiEnsemble happens to be the last step in the pipeline. Has same behavior as transform. 92 | """ 93 | return self.transform(X, era_series=era_series) 94 | 95 | def _standardize(self, X: np.array) -> np.array: 96 | """ 97 | Standardize single era. 98 | :param X: Predictions for a single era. 
99 | :return: Standardized predictions. 100 | """ 101 | percentile_X = (scipy.stats.rankdata(X, method="ordinal") - 0.5) / len(X) 102 | return percentile_X 103 | 104 | def _standardize_by_era(self, X: np.array, era_series: Union[np.array, pd.Series, pd.DataFrame]) -> np.array: 105 | """ 106 | Standardize predictions of a single estimator by era. 107 | :param X: All predictions of a single estimator. 108 | :param era_series: Era labels (strings) for each row in X. 109 | :return: Standardized predictions. 110 | """ 111 | if isinstance(era_series, (pd.Series, pd.DataFrame)): 112 | era_series = era_series.to_numpy().flatten() 113 | df = pd.DataFrame({"prediction": X, "era": era_series}) 114 | df["standardized_prediction"] = df.groupby("era")["prediction"].transform(self._standardize) 115 | return df["standardized_prediction"].values.flatten() 116 | 117 | def _get_donate_weights(self, n: int) -> list: 118 | """ 119 | Exponential weights as per Donate et al.'s formula. 120 | Example donate weighting for 3 folds: [0.25, 0.25, 0.5] 121 | Example donate weighting for 5 folds: [0.0625, 0.0625, 0.125, 0.25, 0.5] 122 | 123 | :param n: Number of estimators. 124 | :return: List of weights. 125 | """ 126 | weights = [] 127 | for j in range(1, n + 1): 128 | j = 2 if j == 1 else j 129 | weights.append(1 / (2 ** (n + 1 - j))) 130 | return weights 131 | 132 | def get_feature_names_out(self, input_features=None) -> List[str]: 133 | return ["numerai_ensemble_predictions"] if not input_features else input_features 134 | 135 | 136 | class PredictionReducer(TransformerMixin, BaseEstimator): 137 | """ 138 | Reduce multiclassification and proba preds to 1 column per model. 139 | If predictions were generated with a regressor or regular predict you don't need this step. 140 | :param n_models: Number of resulting columns. 141 | This indicates how many models were trained to generate the prediction array. 142 | :param n_classes: Number of classes for each prediction. 143 | If predictions were generated with predict_proba and binary classification -> n_classes = 2. 144 | """ 145 | 146 | def __init__(self, n_models: int, n_classes: int): 147 | super().__init__() 148 | if n_models < 1: 149 | raise ValueError(f"n_models must be >= 1. Got '{n_models}'.") 150 | self.n_models = n_models 151 | if n_classes < 2: 152 | raise ValueError(f"n_classes must be >= 2. If n_classes = 1 you don't need PredictionReducer. Got '{n_classes}'.") 153 | self.n_classes = n_classes 154 | self.dot_array = [i for i in range(self.n_classes)] 155 | 156 | def fit(self, X: np.array, y=None): 157 | return self 158 | 159 | def transform(self, X: np.array): 160 | """ 161 | :param X: Input predictions. 162 | :return: Reduced predictions of shape (X.shape[0], self.n_models). 163 | """ 164 | reduced = [] 165 | expected_n_cols = self.n_models * self.n_classes 166 | if len(X.shape) != 2: 167 | raise ValueError(f"Expected X to be a 2D array. Got '{len(X.shape)}' dimension(s).") 168 | if X.shape[1] != expected_n_cols: 169 | raise ValueError(f"Input X must have {expected_n_cols} columns. Got {X.shape[1]} columns while n_models={self.n_models} * n_classes={self.n_classes} = {expected_n_cols}. 
") 170 | for i in range(self.n_models): 171 | # Extracting the predictions of the i-th model 172 | model_preds = X[:, i * self.n_classes : (i + 1) * self.n_classes] 173 | r = model_preds @ self.dot_array 174 | reduced.append(r) 175 | reduced_arr = np.column_stack(reduced) 176 | return reduced_arr 177 | 178 | def predict(self, X: np.array): 179 | """ 180 | For if PredictionReducer happens to be the last step in the pipeline. Has same behavior as transform. 181 | :param X: Input predictions. 182 | :return: Reduced predictions of shape (X.shape[0], self.n_models). 183 | """ 184 | return self.transform(X) 185 | 186 | def get_feature_names_out(self, input_features=None) -> List[str]: 187 | return [f"reduced_prediction_{i}" for i in range(self.n_models)] if not input_features else input_features 188 | -------------------------------------------------------------------------------- /src/numerblox/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class AttrDict(dict): 5 | """Access dictionary elements as attributes.""" 6 | 7 | def __init__(self, *args, **kwargs): 8 | super(AttrDict, self).__init__(*args, **kwargs) 9 | self.__dict__ = self 10 | 11 | 12 | class Key: 13 | """Numerai credentials.""" 14 | 15 | def __init__(self, pub_id: str, secret_key: str): 16 | self.pub_id = pub_id 17 | self.secret_key = secret_key 18 | 19 | def __repr__(self): 20 | return f"Numerai Auth Key. pub_id = '{self.pub_id}'" 21 | 22 | def __str__(self): 23 | return self.__repr__() 24 | 25 | 26 | def load_key_from_json(file_path: str, *args, **kwargs): 27 | """ 28 | Initialize Key object from JSON file. \n 29 | Credentials file must have the following format: \n 30 | `{"pub_id": "PUBLIC_ID", "secret_key": "SECRET_KEY"}` 31 | """ 32 | with open(file_path) as json_file: 33 | json_data = json.load(json_file, *args, **kwargs) 34 | pub_id = json_data["pub_id"] 35 | secret_key = json_data["secret_key"] 36 | return Key(pub_id=pub_id, secret_key=secret_key) 37 | -------------------------------------------------------------------------------- /src/numerblox/model_upload.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Any, Callable, List, Optional, Union 3 | 4 | import cloudpickle 5 | import pandas as pd 6 | from numerapi import NumerAPI 7 | 8 | from .misc import Key 9 | 10 | 11 | class NumeraiModelUpload: 12 | """ 13 | A class to handle the uploading of machine learning models to Numerai's servers. 14 | 15 | :param key: API key object containing public and secret keys for NumerAPI authentication. 16 | :param max_retries: Maximum number of attempts to upload the model. 17 | :param sleep_time: Number of seconds to wait between retries. 18 | :param fail_silently: Whether to suppress exceptions during upload. 19 | """ 20 | 21 | def __init__(self, key: Key = None, max_retries: int = 2, sleep_time: int = 10, fail_silently: bool = False, *args, **kwargs): 22 | """ 23 | Initializes the NumeraiModelUpload class with the necessary configuration. 24 | 25 | :param key: API key object containing public and secret keys for NumerAPI authentication. 26 | :param max_retries: Maximum number of retry attempts for model upload. 27 | :param sleep_time: Time (in seconds) to wait between retries. 28 | :param fail_silently: If True, suppress errors during model upload. 29 | :param *args: Additional arguments for NumerAPI. 30 | :param **kwargs: Additional keyword arguments for NumerAPI. 
31 | """ 32 | # Initialize NumerAPI with the provided keys and other arguments 33 | self.api = NumerAPI(public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs) 34 | self.max_retries = max_retries # Set the maximum number of retries 35 | self.sleep_time = sleep_time # Set the sleep time between retries 36 | self.fail_silently = fail_silently # Determine whether to fail silently 37 | 38 | def create_and_upload_model(self, model: Any, feature_cols: Optional[List[str]] = None, model_name: str = None, file_path: str = None, data_version: str = None, docker_image: str = None, custom_predict_func: Callable[[pd.DataFrame], pd.DataFrame] = None) -> Union[str, None]: 39 | """ 40 | Creates a model prediction function, serializes it, and uploads the model to Numerai. 41 | :param model: The machine learning model object. 42 | :param feature_cols: List of feature column names for predictions. Defaults to None. 43 | :param model_name: The name of the model to upload. 44 | :param file_path: The file path where the serialized model function will be saved. 45 | :param data_version: Data version to use for model upload. 46 | :param docker_image: Docker image to use for model upload. 47 | :param custom_predict_func: Custom prediction function to use instead of the model's predict method. 48 | 49 | :return: Upload ID if the upload is successful, None otherwise. 50 | """ 51 | # Determine which prediction function to use 52 | if custom_predict_func is not None: 53 | predict = custom_predict_func # Use custom prediction function if provided 54 | else: 55 | # Define default prediction function 56 | def predict(live_features: pd.DataFrame) -> pd.DataFrame: 57 | # Determine feature columns to use for predictions 58 | if feature_cols is None: 59 | feature_cols_local = [col for col in live_features.columns if col.startswith("feature_")] 60 | else: 61 | feature_cols_local = feature_cols 62 | 63 | # Predict using the model 64 | live_predictions = model.predict(live_features[feature_cols_local]) 65 | 66 | # Rank predictions and convert to a DataFrame 67 | submission = pd.Series(live_predictions, index=live_features.index).rank(pct=True, method="first") 68 | return submission.to_frame("prediction") 69 | 70 | # Serialize the prediction function and save to the specified file path 71 | print(f"Serializing the predict function and saving to '{file_path}'") 72 | with open(file_path, "wb") as f: 73 | cloudpickle.dump(predict, f) 74 | 75 | # Get the model ID for the specified model name 76 | model_id = self._get_model_id(model_name=model_name) 77 | api_type = self.api.__class__.__name__ # Get the type of API being used 78 | print(f"{api_type}: Uploading model from '{file_path}' for model '{model_name}' (model_id='{model_id}')") 79 | 80 | # Attempt to upload the model, retrying if necessary 81 | for attempt in range(self.max_retries): 82 | try: 83 | # Attempt to upload the model 84 | upload_id = self.api.model_upload(file_path=file_path, model_id=model_id, data_version=data_version, docker_image=docker_image) 85 | print(f"{api_type} model upload of '{file_path}' for '{model_name}' is successful! Upload ID: {upload_id}") 86 | return upload_id # Return upload ID if successful 87 | except Exception as e: 88 | # Handle failed upload attempts 89 | if attempt < self.max_retries - 1: 90 | print(f"Failed to upload model '{file_path}' for '{model_name}' to Numerai. 
Retrying in {self.sleep_time} seconds...") 91 | print(f"Error: {e}") 92 | time.sleep(self.sleep_time) # Wait before retrying 93 | else: 94 | # Handle final failed attempt 95 | if self.fail_silently: 96 | print(f"Failed to upload model '{file_path}' for '{model_name}' to Numerai. Skipping...") 97 | print(f"Error: {e}") 98 | else: 99 | print(f"Failed to upload model '{file_path}' for '{model_name}' after {self.max_retries} attempts.") 100 | raise e # Raise the exception if not failing silently 101 | 102 | def get_available_data_versions(self) -> dict: 103 | """ 104 | Retrieves the available data versions for model uploads. 105 | 106 | :return: A dictionary of available data versions. 107 | """ 108 | # Call NumerAPI to get available data versions 109 | return self.api.model_upload_data_versions() 110 | 111 | def get_available_docker_images(self) -> dict: 112 | """ 113 | Retrieves the available Docker images for model uploads. 114 | 115 | :return: A dictionary of available Docker images. 116 | """ 117 | # Call NumerAPI to get available Docker images 118 | return self.api.model_upload_docker_images() 119 | 120 | def _get_model_id(self, model_name: str) -> str: 121 | """ 122 | Retrieves the model ID for a given model name. 123 | 124 | :param model_name: The name of the model. 125 | :return: The ID of the model. 126 | 127 | Raises ValueError if the model name is not found in the user's Numerai account. 128 | """ 129 | # Get the mapping of model names to model IDs 130 | model_mapping = self.get_model_mapping 131 | if model_name in model_mapping: 132 | return model_mapping[model_name] # Return the model ID if found 133 | else: 134 | # Raise an error if the model name is not found 135 | available_models = ", ".join(model_mapping.keys()) 136 | raise ValueError(f"Model name '{model_name}' not found in your Numerai account. " f"Available model names: {available_models}") 137 | 138 | @property 139 | def get_model_mapping(self) -> dict: 140 | """ 141 | Retrieves the mapping of model names to their IDs from the user's Numerai account. 142 | 143 | :return: A dictionary mapping model names to model IDs. 144 | """ 145 | # Call NumerAPI to get the model mapping 146 | return self.api.get_models() 147 | -------------------------------------------------------------------------------- /src/numerblox/models.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn 3 | from sklearn.utils.validation import check_is_fitted 4 | from xgboost import XGBRegressor 5 | 6 | from .evaluation import NumeraiClassicEvaluator 7 | 8 | 9 | class EraBoostedXGBRegressor(XGBRegressor): 10 | """ 11 | Custom XGBRegressor model that upweights the worst eras in the data. 12 | The worst eras are determined by Corrv2. 13 | NOTE: Currently only supports single target regression. 14 | 15 | This idea was first proposed by Richard Craib in the Numerai forums: 16 | https://forum.numer.ai/t/era-boosted-models/189 17 | 18 | Credits to Michael Oliver (mdo) for proposing the 1st XGBoost implementation of era boosting: 19 | https://forum.numer.ai/t/era-boosted-models/189/3 20 | 21 | :param proportion: Proportion of eras to upweight. 22 | :param trees_per_step: Number of trees to add per iteration. 23 | :param num_iters: Number of total era boosting iterations. 
24 | """ 25 | 26 | def __init__(self, proportion=0.5, trees_per_step=10, num_iters=200, **xgb_params): 27 | sklearn.set_config(enable_metadata_routing=True) 28 | self.set_fit_request(era_series=True) 29 | super().__init__(**xgb_params) 30 | if not self.n_estimators: 31 | self.n_estimators = 100 32 | assert self.n_estimators >= 1, "n_estimators must be at least 1." 33 | 34 | assert 0 < proportion < 1, "proportion must be between 0 and 1." 35 | self.proportion = proportion 36 | assert trees_per_step >= 0, "trees_per_step must be at least 1." 37 | self.trees_per_step = trees_per_step 38 | assert num_iters >= 2, "num_iters must be at least 2." 39 | self.num_iters = num_iters 40 | 41 | def fit(self, X, y, era_series: pd.Series, **fit_params): 42 | super().fit(X, y, **fit_params) 43 | evaluator = NumeraiClassicEvaluator(era_col="era") 44 | self.feature_names = self.get_booster().feature_names 45 | iter_df = pd.DataFrame(X, columns=self.feature_names) 46 | iter_df["target"] = y 47 | iter_df["era"] = era_series 48 | 49 | for _ in range(self.num_iters - 1): 50 | preds = self.predict(X) 51 | iter_df["predictions"] = preds 52 | era_scores = pd.Series(index=iter_df["era"].unique()) 53 | 54 | # Per era Corrv2 aka "Numerai Corr". 55 | era_scores = evaluator.per_era_numerai_corrs(dataf=iter_df, pred_col="predictions", target_col="target") 56 | # Filter on eras with worst Corrv2. 57 | era_scores.sort_values(inplace=True) 58 | worst_eras = era_scores[era_scores <= era_scores.quantile(self.proportion)].index 59 | worst_df = iter_df[iter_df["era"].isin(worst_eras)] 60 | 61 | # Add estimators and fit on worst eras. 62 | self.n_estimators += self.trees_per_step 63 | booster = self.get_booster() 64 | super().fit(worst_df.drop(columns=["target", "era", "predictions"]), worst_df["target"], xgb_model=booster, **fit_params) 65 | return self 66 | 67 | def get_feature_names_out(self, input_features=None): 68 | """Get output feature names for transformation.""" 69 | check_is_fitted(self) 70 | return self.feature_names if not input_features else input_features 71 | -------------------------------------------------------------------------------- /src/numerblox/neutralizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod 3 | from typing import List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats as sp 8 | import sklearn 9 | from joblib import Parallel, delayed 10 | from sklearn.base import BaseEstimator, TransformerMixin 11 | from sklearn.preprocessing import MinMaxScaler 12 | from tqdm import tqdm 13 | 14 | 15 | class BaseNeutralizer(TransformerMixin, BaseEstimator): 16 | """ 17 | Base class for neutralization so it is compatible with scikit-learn. 18 | :param new_col_name: Name of new neutralized column. 19 | """ 20 | 21 | def __init__(self, new_col_names: list): 22 | self.new_col_names = new_col_names 23 | sklearn.set_config(enable_metadata_routing=True) 24 | self.set_transform_request(features=True, era_series=True) 25 | self.set_predict_request(features=True, era_series=True) 26 | super().__init__() 27 | 28 | def fit(self, X=None, y=None): 29 | return self 30 | 31 | @abstractmethod 32 | def transform(self, X: Union[np.array, pd.DataFrame], features: pd.DataFrame, era_series: pd.Series) -> np.array: ... 
33 | 34 | def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 35 | """Convenience function for scikit-learn compatibility.""" 36 | return self.transform(X=X, features=features, era_series=era_series) 37 | 38 | def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 39 | """ 40 | Convenience function for scikit-learn compatibility. 41 | Needed because fit and transform except different arguments here. 42 | """ 43 | return self.fit().transform(X=X, features=features, era_series=era_series) 44 | 45 | def get_feature_names_out(self, input_features: list = None) -> list: 46 | """ 47 | Get feature names for neutralized output. 48 | 49 | :param input_features: Optional list of input feature names. 50 | :return: List of feature names for neutralized output. 51 | """ 52 | return input_features if input_features else self.new_col_names 53 | 54 | 55 | class FeatureNeutralizer(BaseNeutralizer): 56 | """ 57 | Classic feature neutralization by subtracting a linear model. 58 | 59 | :param pred_name: Name of prediction column. For creating the new column name. 60 | :param proportion: Number in range [0...1] indicating how much to neutralize. 61 | :param suffix: Optional suffix that is added to new column name. 62 | :param num_cores: Number of cores to use for parallel processing. 63 | By default, all CPU cores are used. 64 | """ 65 | 66 | def __init__(self, pred_name: Union[str, list] = "prediction", proportion: Union[float, List[float]] = 0.5, suffix: str = None, num_cores: int = -1): 67 | self.pred_name = [pred_name] if isinstance(pred_name, str) else pred_name 68 | self.proportion = [proportion] if isinstance(proportion, float) else proportion 69 | assert len(self.pred_name) == len(set(self.pred_name)), "Duplicate 'pred_names' found. Make sure all names are unique." 70 | assert len(self.proportion) == len(set(self.proportion)), "Duplicate 'proportions' found. Make sure all proportions are unique." 71 | for prop in self.proportion: 72 | assert 0.0 <= prop <= 1.0, f"'proportion' should be a float in range [0...1]. Got '{prop}'." 73 | 74 | new_col_names = [] 75 | for pred_name in self.pred_name: 76 | for prop in self.proportion: 77 | new_col_names.append(f"{pred_name}_neutralized_{prop}_{suffix}" if suffix else f"{pred_name}_neutralized_{prop}") 78 | super().__init__(new_col_names=new_col_names) 79 | self.suffix = suffix 80 | self.num_cores = num_cores 81 | 82 | def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 83 | """ 84 | Main transform function. 85 | :param X: Input predictions to neutralize. \n 86 | :param features: DataFrame with features for neutralization. \n 87 | :param era_series: Series with era labels for each row in features. \n 88 | Features, era_series and the prediction column must all have the same length. 89 | :return: Neutralized predictions NumPy array. 90 | """ 91 | if era_series is None: 92 | warnings.warn("""WARNING: 'era_series' not provided for 93 | neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.""") 94 | else: 95 | assert len(X) == len(era_series), "Input predictions must have same length as era_series." 
96 | assert len(X) == len(features), "Input predictions must have same length as features." 97 | 98 | df = features.copy() 99 | if not isinstance(X, np.ndarray): 100 | X = np.array(X) 101 | # Ensure X is a 2D array and has the same number of columns as pred_name 102 | if X.ndim == 1: 103 | assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'." 104 | X = X.reshape(-1, 1) 105 | else: 106 | assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'." 107 | for i, pred_name in enumerate(self.pred_name): 108 | df[pred_name] = X[:, i] 109 | # Treat input as 1 era if era_series is not provided. 110 | df["era"] = era_series if era_series is not None else "X" 111 | 112 | feature_cols = list(features.columns) 113 | tasks = [delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols) for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") for proportion in self.proportion] 114 | neutralized_results = Parallel(n_jobs=self.num_cores)(tasks) 115 | neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy() 116 | return neutralized_preds 117 | 118 | def _process_pred_name(self, df: pd.DataFrame, pred_name: str, proportion: float, feature_cols: List[str]) -> pd.DataFrame: 119 | """ 120 | Process one combination of prediction and proportion. 121 | :param df: DataFrame with features and predictions. 122 | :param pred_name: Name of prediction column. 123 | :param proportion: Proportion to neutralize. 124 | :param feature_cols: List of feature column names. 125 | :return: Neutralized predictions. 126 | Neutralized predictions are scaled to [0...1]. 127 | """ 128 | neutralized_pred = df.groupby("era", group_keys=False).apply(lambda x: self.normalize_and_neutralize(x, [pred_name], feature_cols, proportion)) 129 | return pd.DataFrame(MinMaxScaler().fit_transform(neutralized_pred)) 130 | 131 | def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame: 132 | """ 133 | Neutralize on CPU. 134 | :param dataf: DataFrame with features and predictions. 135 | :param columns: List of prediction column names. 136 | :param by: List of feature column names. 137 | :param proportion: Proportion to neutralize. 138 | :return: Neutralized predictions. 139 | """ 140 | scores = dataf[columns] 141 | exposures = dataf[by].values 142 | scores = scores - proportion * self._get_raw_exposures(exposures, scores) 143 | return scores / scores.std() 144 | 145 | @staticmethod 146 | def normalize(dataf: pd.DataFrame) -> np.ndarray: 147 | """Normalize predictions. 148 | 1. Rank predictions. 149 | 2. Normalize ranks. 150 | 3. Gaussianize ranks. 151 | :param dataf: DataFrame with predictions. 152 | :return: Gaussianized rank predictions. 153 | """ 154 | normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf) 155 | # Gaussianized ranks 156 | return sp.norm.ppf(normalized_ranks) 157 | 158 | def normalize_and_neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame: 159 | """ 160 | Gaussianize predictions and neutralize with one combination of prediction and proportion. 161 | :param dataf: DataFrame with features and predictions. 162 | :param columns: List of prediction column names. 163 | :param by: List of feature column names. 164 | :param proportion: Proportion to neutralize. 165 | :return: Neutralized predictions DataFrame. 
166 | """ 167 | dataf[columns] = self.normalize(dataf[columns]) 168 | dataf[columns] = self.neutralize(dataf, columns, by, proportion) 169 | return dataf[columns] 170 | 171 | @staticmethod 172 | def _get_raw_exposures(exposures: np.array, scores: pd.DataFrame) -> np.array: 173 | """ 174 | Get raw feature exposures. 175 | Make sure predictions are normalized! 176 | :param exposures: Exposures for each era. 177 | :param scores: DataFrame with predictions. 178 | :return: Raw exposures for each era. 179 | """ 180 | return exposures @ np.linalg.lstsq(exposures, scores.values, rcond=None)[0] 181 | -------------------------------------------------------------------------------- /src/numerblox/penalizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy 8 | import sklearn 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | from tqdm.auto import tqdm 11 | 12 | try: 13 | import tensorflow as tf 14 | except ImportError: 15 | warnings.warn("TensorFlow is not installed. Some NumerBlox Penalizers may not work. " "To use all features, please install TensorFlow: `pip install tensorflow`", ImportWarning) 16 | 17 | 18 | class BasePenalizer(TransformerMixin, BaseEstimator): 19 | """ 20 | Base class for penalization so it is compatible with scikit-learn. 21 | :param new_col_name: Name of new neutralized column. 22 | """ 23 | 24 | def __init__(self, new_col_name: str): 25 | sklearn.set_config(enable_metadata_routing=True) 26 | self.set_transform_request(features=True, era_series=True) 27 | self.set_predict_request(features=True, era_series=True) 28 | self.new_col_name = new_col_name 29 | super().__init__() 30 | 31 | def fit(self, X=None, y=None): 32 | return self 33 | 34 | @abstractmethod 35 | def transform(self, X: Union[np.array, pd.DataFrame], features: pd.DataFrame, era_series: pd.Series) -> np.array: ... 36 | 37 | def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: 38 | """Convenience function for scikit-learn compatibility.""" 39 | return self.transform(X=X, features=features, era_series=era_series) 40 | 41 | def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: 42 | """ 43 | Convenience function for scikit-learn compatibility. 44 | Needed because fit and transform except different arguments here. 45 | """ 46 | return self.fit().transform(X=X, features=features, era_series=era_series) 47 | 48 | def get_feature_names_out(self, input_features: list = None) -> list: 49 | """ 50 | Get feature names for neutralized output. 51 | 52 | :param input_features: Optional list of input feature names. 53 | :return: List of feature names for neutralized output. 54 | """ 55 | return input_features if input_features else [self.new_col_name] 56 | 57 | 58 | class FeaturePenalizer(BasePenalizer): 59 | """ 60 | Feature penalization with TensorFlow. 61 | 62 | Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb 63 | 64 | Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12 65 | 66 | :param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to. 67 | :param pred_name: Prediction column name. Used for new column name. 
\n 68 | :param suffix: Optional suffix that is added to new column name. 69 | """ 70 | 71 | def __init__( 72 | self, 73 | max_exposure: float, 74 | pred_name: str = "prediction", 75 | suffix: str = None, 76 | ): 77 | self.max_exposure = max_exposure 78 | self.pred_name = pred_name 79 | assert 0.0 <= max_exposure <= 1.0, f"'max_exposure' should be a float in range [0...1]. Got '{self.max_exposure}'." 80 | new_col_name = f"{self.pred_name}_penalized_{self.max_exposure}_{suffix}" if suffix else f"{self.pred_name}_penalized_{self.max_exposure}" 81 | super().__init__(new_col_name=new_col_name) 82 | self.suffix = suffix 83 | 84 | def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array: 85 | """ 86 | Main transform method. 87 | :param X: Input predictions to neutralize. 88 | :param features: DataFrame with features for neutralization. 89 | :param era_series: Series with era labels for each row in features. 90 | Features, eras and the prediction column must all have the same length. 91 | :return: Penalized predictions. 92 | """ 93 | assert len(X) == len(features), "Input predictions must have same length as features." 94 | assert len(X) == len(era_series), "Input predictions must have same length as eras." 95 | df = features.copy() 96 | df["prediction"] = X 97 | df["era"] = era_series 98 | penalized_data = self._reduce_all_exposures(dataf=df, column=self.pred_name, neutralizers=list(features.columns)) 99 | return penalized_data 100 | 101 | def _reduce_all_exposures( 102 | self, 103 | dataf: pd.DataFrame, 104 | column: str = "prediction", 105 | neutralizers: list = None, 106 | normalize=True, 107 | gaussianize=True, 108 | ) -> pd.DataFrame: 109 | neutralized = [] 110 | 111 | for era in tqdm(dataf["era"].unique()): 112 | dataf_era = dataf[dataf["era"] == era] 113 | scores = dataf_era[[column]].values 114 | exposure_values = dataf_era[neutralizers].values 115 | 116 | if normalize: 117 | scores2 = [] 118 | for x in scores.T: 119 | x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x) 120 | if gaussianize: 121 | x = scipy.stats.norm.ppf(x) 122 | scores2.append(x) 123 | scores = np.array(scores2)[0] 124 | 125 | scores, _ = self._reduce_exposure(scores, exposure_values, len(neutralizers), None) 126 | 127 | scores /= tf.math.reduce_std(scores) 128 | scores -= tf.reduce_min(scores) 129 | scores /= tf.reduce_max(scores) 130 | neutralized.append(scores.numpy()) 131 | 132 | predictions = pd.DataFrame(np.concatenate(neutralized), columns=[column], index=dataf.index) 133 | return predictions 134 | 135 | def _reduce_exposure(self, prediction, features, input_size=50, weights=None): 136 | model = tf.keras.models.Sequential( 137 | [ 138 | tf.keras.layers.Input(input_size), 139 | tf.keras.experimental.LinearModel(use_bias=False), 140 | ] 141 | ) 142 | feats = tf.convert_to_tensor(features - 0.5, dtype=tf.float32) 143 | pred = tf.convert_to_tensor(prediction, dtype=tf.float32) 144 | if weights is None: 145 | optimizer = tf.keras.optimizers.Adamax() 146 | start_exp = self.__exposures(feats, pred[:, None]) 147 | target_exps = tf.clip_by_value(start_exp, -self.max_exposure, self.max_exposure) 148 | self._train_loop(model, optimizer, feats, pred, target_exps) 149 | else: 150 | model.set_weights(weights) 151 | return pred[:, None] - model(feats), model.get_weights() 152 | 153 | def _train_loop(self, model, optimizer, feats, pred, target_exps): 154 | for _ in range(1000000): 155 | loss, grads = self.__train_loop_body(model, feats, pred, target_exps) 156 | 
optimizer.apply_gradients(zip(grads, model.trainable_variables)) 157 | if loss < 1e-7: 158 | break 159 | 160 | def __train_loop_body(self, model, feats, pred, target_exps): 161 | with tf.GradientTape() as tape: 162 | exps = self.__exposures(feats, pred[:, None] - model(feats, training=True)) 163 | loss = tf.reduce_sum(tf.nn.relu(tf.nn.relu(exps) - tf.nn.relu(target_exps)) + tf.nn.relu(tf.nn.relu(-exps) - tf.nn.relu(-target_exps))) 164 | return loss, tape.gradient(loss, model.trainable_variables) 165 | 166 | @staticmethod 167 | def __exposures(x, y): 168 | x = x - tf.math.reduce_mean(x, axis=0) 169 | x = x / tf.norm(x, axis=0) 170 | y = y - tf.math.reduce_mean(y, axis=0) 171 | y = y / tf.norm(y, axis=0) 172 | return tf.matmul(x, y, transpose_a=True) 173 | -------------------------------------------------------------------------------- /src/numerblox/prediction_loaders.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | from uuid import uuid4 4 | 5 | import pandas as pd 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | 8 | from .download import NumeraiClassicDownloader 9 | 10 | 11 | class BasePredictionLoader(TransformerMixin, BaseEstimator): 12 | """Shared functionality for all Prediction Loaders.""" 13 | 14 | def __init__(self): ... 15 | 16 | def fit(self, X=None, y=None): 17 | return self 18 | 19 | @abstractmethod 20 | def transform(self, X=None, y=None) -> pd.DataFrame: 21 | """Return Predictions generated by model.""" 22 | ... 23 | 24 | @abstractmethod 25 | def get_feature_names_out(self, input_features=None): 26 | """Return feature names.""" 27 | ... 28 | 29 | 30 | class ExamplePredictions(BasePredictionLoader): 31 | """ 32 | Load example predictions. 33 | :param file_name: File to download from NumerAPI. 34 | By default this is example predictions for v5.0 data. 35 | 'v5.0/live_example_preds.parquet' by default. 36 | Example predictions in previous versions: 37 | - v5.0. validation examples -> "v5.0/validation_example_preds.parquet" 38 | - v5.0. live benchmark models -> "v5.0/live_benchmark_models.parquet" 39 | - v5.0. validation benchmark models -> "v5.0/validation_benchmark_models.parquet" 40 | :param round_num: Optional round number. Downloads most recent round by default. 41 | :param keep_files: Whether to keep downloaded files. 42 | By default, files are deleted after the predictions are loaded. 
43 | """ 44 | 45 | def __init__(self, file_name: str = "v5.0/live_example_preds.parquet", round_num: int = None, keep_files: bool = False): 46 | super().__init__() 47 | self.file_name = file_name 48 | self.round_num = round_num 49 | self.keep_files = keep_files 50 | 51 | def transform(self, X=None, y=None) -> pd.DataFrame: 52 | """Return example predictions.""" 53 | self._download_example_preds() 54 | example_preds = self._load_example_preds() 55 | if not self.keep_files: 56 | self.downloader.remove_base_directory() 57 | return example_preds 58 | 59 | def _download_example_preds(self): 60 | data_directory = f"example_predictions_loader_{uuid4()}" 61 | self.downloader = NumeraiClassicDownloader(directory_path=data_directory) 62 | self.dest_path = f"{str(self.downloader.dir)}/{self.file_name}" 63 | self.downloader.download_single_dataset(filename=self.file_name, dest_path=self.dest_path, round_num=self.round_num) 64 | 65 | def _load_example_preds(self, *args, **kwargs): 66 | return pd.read_parquet(self.dest_path, *args, **kwargs) 67 | 68 | def get_feature_names_out(self, input_features=None): 69 | return [Path(self.file_name).with_suffix("").as_posix()] if not input_features else input_features 70 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from numerblox.preprocessing.classic import * 2 | from numerblox.preprocessing.signals import * 3 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.base import BaseEstimator, TransformerMixin 8 | 9 | 10 | class BasePreProcessor(TransformerMixin, BaseEstimator): 11 | """Common functionality for preprocessors and postprocessors.""" 12 | 13 | def __init__(self): 14 | sklearn.set_config(enable_metadata_routing=True) 15 | 16 | def fit(self, X, y=None): 17 | self.is_fitted_ = True 18 | return self 19 | 20 | @abstractmethod 21 | def transform(self, X: Union[np.array, pd.DataFrame], y=None, **kwargs) -> pd.DataFrame: ... 22 | 23 | @abstractmethod 24 | def get_feature_names_out(self, input_features=None) -> List[str]: ... 25 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/classic.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from numerblox.feature_groups import V5_FEATURE_GROUP_MAPPING 8 | from numerblox.preprocessing.base import BasePreProcessor 9 | 10 | 11 | class GroupStatsPreProcessor(BasePreProcessor): 12 | """ 13 | Note that this class only works with `pd.DataFrame` input. 14 | When using in a Pipeline, make sure that the Pandas output API is set (`.set_output(transform="pandas")`. 15 | 16 | Calculates group statistics for all data groups. \n 17 | :param groups: Groups to create features for. All groups by default. 
\n 18 | """ 19 | 20 | def __init__(self, groups: list = None): 21 | super().__init__() 22 | self.all_groups = ["intelligence", "charisma", "strength", "dexterity", "constitution", "wisdom", "agility", "serenity", "sunshine", "rain"] 23 | self.groups = groups 24 | self.group_names = groups if self.groups else self.all_groups 25 | self.feature_group_mapping = V5_FEATURE_GROUP_MAPPING 26 | 27 | def transform(self, X: pd.DataFrame) -> np.array: 28 | """Check validity and add group features.""" 29 | dataf = self._add_group_features(X) 30 | return dataf.to_numpy() 31 | 32 | def _add_group_features(self, X: pd.DataFrame) -> pd.DataFrame: 33 | """Mean, standard deviation and skew for each group.""" 34 | dataf = pd.DataFrame() 35 | for group in self.group_names: 36 | cols = self.feature_group_mapping[group] 37 | valid_cols = [col for col in cols if col in X.columns] 38 | if not valid_cols: 39 | warnings.warn(f"None of the columns of '{group}' are in the input data. Output will be nans for the group features.") 40 | elif len(cols) != len(valid_cols): 41 | warnings.warn(f"Not all columns of '{group}' are in the input data ({len(valid_cols)} < {len(cols)}). Use remaining columns for stats features.") 42 | dataf.loc[:, f"feature_{group}_mean"] = X[valid_cols].mean(axis=1) 43 | dataf.loc[:, f"feature_{group}_std"] = X[valid_cols].std(axis=1) 44 | dataf.loc[:, f"feature_{group}_skew"] = X[valid_cols].skew(axis=1) 45 | return dataf 46 | 47 | def get_feature_names_out(self, input_features=None) -> List[str]: 48 | """Return feature names.""" 49 | if not input_features: 50 | feature_names = [] 51 | for group in self.group_names: 52 | feature_names.append(f"feature_{group}_mean") 53 | feature_names.append(f"feature_{group}_std") 54 | feature_names.append(f"feature_{group}_skew") 55 | else: 56 | feature_names = input_features 57 | return feature_names 58 | -------------------------------------------------------------------------------- /src/numerblox/targets.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from scipy.stats import rankdata 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.linear_model import Ridge 10 | from sklearn.mixture import BayesianGaussianMixture 11 | from sklearn.utils.validation import check_is_fitted 12 | from tqdm import tqdm 13 | 14 | # Ignore SettingWithCopyWarning 15 | pd.options.mode.chained_assignment = None 16 | 17 | 18 | class BaseTargetProcessor(TransformerMixin, BaseEstimator): 19 | """Common functionality for preprocessors and postprocessors.""" 20 | 21 | def __init__(self): 22 | sklearn.set_config(enable_metadata_routing=True) 23 | self.set_transform_request(era_series=True) 24 | 25 | def fit(self, X, y=None): 26 | self.is_fitted_ = True 27 | return self 28 | 29 | @abstractmethod 30 | def transform(self, X: Union[np.array, pd.DataFrame], y=None) -> pd.DataFrame: ... 31 | 32 | @abstractmethod 33 | def get_feature_names_out(self, input_features=None) -> List[str]: ... 34 | 35 | 36 | class BayesianGMMTargetProcessor(BaseTargetProcessor): 37 | """ 38 | Generate synthetic (fake) target using a Bayesian Gaussian Mixture model. \n 39 | Based on Michael Oliver's GitHub Gist implementation: \n 40 | https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93 41 | 42 | :param n_components: Number of components for fitting Bayesian Gaussian Mixture Model. 
43 | """ 44 | 45 | def __init__( 46 | self, 47 | n_components: int = 3, 48 | ): 49 | super().__init__() 50 | self.set_fit_request(era_series=True) 51 | self.n_components = n_components 52 | self.ridge = Ridge(fit_intercept=False) 53 | self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1] 54 | 55 | def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series): 56 | """ 57 | Fit Bayesian Gaussian Mixture model on coefficients and normalize. 58 | :param X: DataFrame containing features. 59 | :param y: Series containing real target. 60 | :param era_series: Series containing era information. 61 | """ 62 | bgmm = BayesianGaussianMixture(n_components=self.n_components) 63 | coefs = self._get_coefs(dataf=X, y=y, era_series=era_series) 64 | bgmm.fit(coefs) 65 | # make probability of sampling each component equal to better balance rare regimes 66 | bgmm.weights_[:] = 1 / self.n_components 67 | self.bgmm_ = bgmm 68 | self.is_fitted_ = True 69 | return self 70 | 71 | def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array: 72 | """ 73 | Main method for generating fake target. 74 | :param X: DataFrame containing features. 75 | :param era_series: Series containing era information. 76 | """ 77 | check_is_fitted(self, "bgmm_") 78 | assert len(X) == len(era_series), "X and eras must be same length." 79 | all_eras = era_series.unique().tolist() 80 | # Scale data between 0 and 1 81 | X = X.astype(float) 82 | X /= X.max() 83 | X -= 0.5 84 | X.loc[:, "era"] = era_series 85 | 86 | fake_target = self._generate_target(dataf=X, all_eras=all_eras) 87 | return fake_target 88 | 89 | def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, era_series: pd.Series) -> np.ndarray: 90 | """ 91 | Generate coefficients for BGMM. 92 | :param dataf: DataFrame containing features. 93 | :param y: Series containing real target. 94 | """ 95 | coefs = [] 96 | dataf.loc[:, "era"] = era_series 97 | dataf.loc[:, "target"] = y 98 | all_eras = dataf["era"].unique().tolist() 99 | for era in all_eras: 100 | era_df = dataf[dataf["era"] == era] 101 | era_y = era_df.loc[:, "target"] 102 | era_df = era_df.drop(columns=["era", "target"]) 103 | self.ridge.fit(era_df, era_y) 104 | coefs.append(self.ridge.coef_) 105 | stacked_coefs = np.vstack(coefs) 106 | return stacked_coefs 107 | 108 | def _generate_target(self, dataf: pd.DataFrame, all_eras: list) -> np.ndarray: 109 | """Generate fake target using Bayesian Gaussian Mixture model.""" 110 | fake_target = [] 111 | for era in tqdm(all_eras, desc="Generating fake target"): 112 | features = dataf[dataf["era"] == era] 113 | features = features.drop(columns=["era", "target"]) 114 | # Sample a set of weights from GMM 115 | beta, _ = self.bgmm_.sample(1) 116 | # Create fake continuous target 117 | fake_targ = features @ beta[0] 118 | # Bin fake target like real target 119 | fake_targ = (rankdata(fake_targ) - 0.5) / len(fake_targ) 120 | fake_targ = (np.digitize(fake_targ, self.bins) - 1) / 4 121 | fake_target.append(fake_targ) 122 | return np.concatenate(fake_target) 123 | 124 | def get_feature_names_out(self, input_features=None) -> List[str]: 125 | """Return feature names.""" 126 | return ["fake_target"] if not input_features else input_features 127 | 128 | 129 | class SignalsTargetProcessor(BaseTargetProcessor): 130 | """ 131 | Engineer targets for Numerai Signals. \n 132 | More information on implements Numerai Signals targets: \n 133 | https://forum.numer.ai/t/decoding-the-signals-target/2501 134 | 135 | :param price_col: Column from which target will be derived. 
\n 136 | :param windows: Timeframes to use for engineering targets. 10 and 20-day by default. \n 137 | :param bins: Binning used to create group targets. Nomi binning by default. \n 138 | :param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default. 139 | """ 140 | 141 | def __init__( 142 | self, 143 | price_col: str = "close", 144 | windows: list = None, 145 | bins: list = None, 146 | labels: list = None, 147 | ): 148 | super().__init__() 149 | self.price_col = price_col 150 | self.windows = windows if windows else [10, 20] 151 | self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1] 152 | self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1] 153 | 154 | def transform(self, dataf: pd.DataFrame, era_series: pd.Series) -> np.array: 155 | for window in tqdm(self.windows, desc="Signals target engineering windows"): 156 | dataf.loc[:, f"target_{window}d_raw"] = dataf[self.price_col].pct_change(periods=window).shift(-window) 157 | era_groups = dataf.groupby(era_series) 158 | 159 | dataf.loc[:, f"target_{window}d_rank"] = era_groups[f"target_{window}d_raw"].rank(pct=True, method="first") 160 | dataf.loc[:, f"target_{window}d_group"] = era_groups[f"target_{window}d_rank"].transform(lambda group: pd.cut(group, bins=self.bins, labels=self.labels, include_lowest=True)) 161 | output_cols = self.get_feature_names_out() 162 | return dataf[output_cols].to_numpy() 163 | 164 | def get_feature_names_out(self, input_features=None) -> List[str]: 165 | """Return feature names of Signals targets.""" 166 | if not input_features: 167 | feature_names = [] 168 | for window in self.windows: 169 | feature_names.append(f"target_{window}d_raw") 170 | feature_names.append(f"target_{window}d_rank") 171 | feature_names.append(f"target_{window}d_group") 172 | else: 173 | feature_names = input_features 174 | return feature_names 175 | -------------------------------------------------------------------------------- /tests/test_assets/mock_credentials.json: -------------------------------------------------------------------------------- 1 | {"pub_id": "Hello", "secret_key": "World"} -------------------------------------------------------------------------------- /tests/test_assets/val_3_eras.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/tests/test_assets/val_3_eras.parquet -------------------------------------------------------------------------------- /tests/test_download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/tests/test_download/__init__.py -------------------------------------------------------------------------------- /tests/test_download/test_download_classic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import PosixPath 3 | from uuid import uuid4 4 | 5 | import pytest 6 | from numerapi import NumerAPI 7 | 8 | from numerblox.download import NumeraiClassicDownloader 9 | 10 | ALL_CLASSIC_VERSIONS = set(s.split("/")[0] for s in NumerAPI().list_datasets() if "signals" not in s) 11 | 12 | TEST_CLASSIC_DIR = f"test_numclassic_general_{uuid4()}" 13 | TEST_CLASSIC_VERSIONS = ["5.0"] 14 | 15 | 16 | def test_base(): 17 | numer_classic_downloader = NumeraiClassicDownloader(TEST_CLASSIC_DIR) 18 | 19 | # Test 
building class 20 | assert isinstance(numer_classic_downloader.dir, PosixPath) 21 | assert numer_classic_downloader.dir.is_dir() 22 | 23 | # Test is_empty 24 | (numer_classic_downloader.dir / "test.txt").write_text("test") 25 | assert not numer_classic_downloader.is_empty 26 | 27 | # Remove contents 28 | numer_classic_downloader.remove_base_directory() 29 | assert not os.path.exists(TEST_CLASSIC_DIR) 30 | 31 | 32 | def test_classic(): 33 | dl = NumeraiClassicDownloader(TEST_CLASSIC_DIR) 34 | 35 | # Check versions 36 | assert dl.dataset_versions == ALL_CLASSIC_VERSIONS 37 | 38 | # Test live download 39 | for version in TEST_CLASSIC_VERSIONS: 40 | dl.download_live_data("live", version=version) 41 | assert os.path.exists(dl.dir / "live") 42 | assert os.path.exists(dl.dir / "live" / "live.parquet") 43 | 44 | # Test example data 45 | dl.download_example_data("test/", version=version) 46 | assert os.path.exists(dl.dir / "test") 47 | assert os.path.exists(dl.dir / "test" / "live_example_preds.parquet") 48 | assert os.path.exists(dl.dir / "test" / "validation_example_preds.parquet") 49 | 50 | # Test features 51 | features = dl.get_classic_features() 52 | assert isinstance(features, dict) 53 | assert len(features["feature_sets"]["medium"]) == 705 54 | # Check that feature_stats and feature_sets keys exist 55 | assert "feature_sets" in features.keys() 56 | 57 | dl.remove_base_directory() 58 | 59 | 60 | def test_classic_versions(): 61 | downloader = NumeraiClassicDownloader(directory_path=f"some_path_{uuid4()}") 62 | 63 | # Test unsupported versions 64 | unsupported_versions = ["3"] 65 | for version in unsupported_versions: 66 | with pytest.raises(AssertionError): 67 | downloader.download_training_data(version=version) 68 | with pytest.raises(AssertionError): 69 | downloader.download_live_data(version=version) 70 | 71 | downloader.remove_base_directory() 72 | -------------------------------------------------------------------------------- /tests/test_download/test_download_crypto.py: -------------------------------------------------------------------------------- 1 | import os 2 | from uuid import uuid4 3 | 4 | import pytest 5 | 6 | from numerblox.download import NumeraiCryptoDownloader 7 | 8 | ALL_CRYPTO_VERSIONS = ["v1.0"] 9 | 10 | 11 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 12 | def test_crypto(): 13 | TEST_CRYPTO_DIR = f"test_numcrypto_general_{uuid4()}" 14 | dl = NumeraiCryptoDownloader(TEST_CRYPTO_DIR) 15 | 16 | # Check versions 17 | assert dl.dataset_versions == ALL_CRYPTO_VERSIONS 18 | 19 | # Test live download 20 | dl.download_live_data("live", version="1.0") 21 | assert os.path.exists(dl.dir / "live") 22 | assert os.path.exists(dl.dir / "live" / "live_universe.parquet") 23 | 24 | # Test training data download 25 | dl.download_training_data("train/", version="1.0") 26 | assert os.path.exists(dl.dir / "train") 27 | assert os.path.exists(dl.dir / "train" / "train_targets.parquet") 28 | 29 | 30 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 31 | def test_crypto_versions(): 32 | downloader = NumeraiCryptoDownloader(directory_path=f"some_path_{uuid4()}") 33 | 34 | # Test unsupported versions 35 | unsupported_versions = ["0", "0.5", "3.5"] 36 | for version in unsupported_versions: 37 | with pytest.raises(AssertionError): 38 | downloader.download_training_data(version=version) 39 | with pytest.raises(AssertionError): 40 | downloader.download_live_data(version=version) 41 | -------------------------------------------------------------------------------- 
/tests/test_download/test_download_signals.py: -------------------------------------------------------------------------------- 1 | import os 2 | from uuid import uuid4 3 | 4 | import pytest 5 | from numerapi import SignalsAPI 6 | 7 | from numerblox.download import EODDownloader, KaggleDownloader, NumeraiSignalsDownloader 8 | 9 | ALL_SIGNALS_VERSIONS = set(s.replace("signals/", "").split("/")[0] for s in SignalsAPI().list_datasets() if s.startswith("signals/v")) 10 | TEST_SIGNALS_DIR = f"test_numsignals_general_{uuid4()}" 11 | TEST_SIGNALS_VERSIONS = ["2.0"] 12 | 13 | 14 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 15 | def test_signals(): 16 | dl = NumeraiSignalsDownloader(TEST_SIGNALS_DIR) 17 | 18 | # Check versions 19 | assert dl.dataset_versions == ALL_SIGNALS_VERSIONS 20 | 21 | # Test live download 22 | for version in TEST_SIGNALS_VERSIONS: 23 | dl.download_live_data("live", version=version) 24 | assert os.path.exists(dl.dir / "live") 25 | assert os.path.exists(dl.dir / "live" / "live.parquet") 26 | 27 | # Test example data 28 | dl.download_example_data("test/", version=version) 29 | assert os.path.exists(dl.dir / "test") 30 | assert os.path.exists(dl.dir / "test" / "live_example_preds.parquet") 31 | assert os.path.exists(dl.dir / "test" / "validation_example_preds.parquet") 32 | 33 | dl.remove_base_directory() 34 | 35 | 36 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 37 | def test_signals_versions(): 38 | downloader = NumeraiSignalsDownloader(directory_path=f"some_path_{uuid4()}") 39 | 40 | # Test unsupported versions 41 | unsupported_versions = ["0"] 42 | for version in unsupported_versions: 43 | with pytest.raises(AssertionError): 44 | downloader.download_training_data(version=version) 45 | with pytest.raises(AssertionError): 46 | downloader.download_live_data(version=version) 47 | 48 | downloader.remove_base_directory() 49 | 50 | 51 | @pytest.mark.xfail(reason="May fail due to API rate limiting or missing credentials") 52 | def test_kaggle_downloader(): 53 | try: 54 | kd = KaggleDownloader(f"test_kaggle_{uuid4()}") 55 | assert os.path.exists(kd.dir) 56 | kd.remove_base_directory() 57 | except OSError: 58 | pass 59 | 60 | 61 | @pytest.mark.xfail(reason="May fail due to API rate limiting or missing credentials") 62 | def test_eod(): 63 | eod = EODDownloader(f"test_eod_{uuid4()}", key="DEMO", tickers=["AAPL.US"]) 64 | eod.download_live_data() 65 | eod.download_training_data() 66 | eod.remove_base_directory() 67 | -------------------------------------------------------------------------------- /tests/test_end_to_end.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.compose import ColumnTransformer 3 | from sklearn.model_selection import TimeSeriesSplit 4 | from sklearn.pipeline import make_pipeline, make_union 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklego.preprocessing import ColumnSelector 7 | from xgboost import XGBRegressor 8 | 9 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 10 | from numerblox.meta import CrossValEstimator, MetaEstimator, make_meta_pipeline 11 | from numerblox.neutralizers import FeatureNeutralizer 12 | from numerblox.numerframe import create_numerframe 13 | from numerblox.preprocessing import GroupStatsPreProcessor 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def setup_data(): 18 | df = create_numerframe("tests/test_assets/val_3_eras.parquet") 19 | return df 20 | 21 | 22 | def 
test_neutralized_xgboost_pipeline(setup_data): 23 | df = setup_data 24 | 25 | X, y = df.get_feature_target_pair(multi_target=False) 26 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 27 | era_series = df.get_era_data 28 | features = df.get_feature_data 29 | 30 | # Preprocessing 31 | gpp = GroupStatsPreProcessor(groups=["sunshine", "rain"]) 32 | fncv3_selector = ColumnSelector(fncv3_cols) 33 | # TODO Test with preproc FeatureUnion 34 | preproc_pipe = ColumnTransformer([("gpp", gpp, features.columns.tolist()), ("selector", fncv3_selector, fncv3_cols)]) 35 | 36 | # Model 37 | xgb = XGBRegressor() 38 | cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5)) 39 | ens = NumeraiEnsemble() 40 | fn = FeatureNeutralizer(proportion=0.5) 41 | full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn) 42 | 43 | # Train full model 44 | full_pipe.fit(X, y, era_series=era_series) 45 | # Inference 46 | preds = full_pipe.predict(X, era_series=era_series, features=features) 47 | assert preds.min() >= 0 48 | assert abs(preds.max() - 1) <= 1e-9 49 | assert preds.shape[0] == X.shape[0] 50 | assert len(preds.shape) == 2 51 | 52 | 53 | def test_multi_classification_ensemble(setup_data): 54 | df = setup_data 55 | X, y = df.get_feature_target_pair(multi_target=False) 56 | era_series = df.get_era_data 57 | features = df.get_feature_data 58 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 59 | # TODO Test with preproc FeatureUnion in sklearn 1.5+ 60 | preproc_pipe = ColumnTransformer([("gpp", GroupStatsPreProcessor(groups=["sunshine", "rain"]), features.columns.tolist()), ("selector", ColumnSelector(fncv3_cols), fncv3_cols)]) 61 | 62 | model = DecisionTreeClassifier() 63 | crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func="predict_proba") 64 | pred_rud = PredictionReducer(n_models=3, n_classes=5) 65 | ens = NumeraiEnsemble(donate_weighted=True) 66 | fn = FeatureNeutralizer(proportion=0.5) 67 | full_pipe = make_meta_pipeline(preproc_pipe, crossval, pred_rud, ens, fn) 68 | 69 | y_int = (y * 4).astype(int) 70 | full_pipe.fit(X, y_int, era_series=era_series) 71 | 72 | preds = full_pipe.predict(X, era_series=era_series, features=features) 73 | assert preds.min() >= 0 74 | assert abs(preds.max() - 1) <= 1e-9 75 | assert preds.shape[0] == X.shape[0] 76 | assert len(preds.shape) == 2 77 | 78 | 79 | @pytest.mark.xfail(reason="Can only be tested with sklearn 1.5+") 80 | def test_feature_union_pipeline(setup_data): 81 | df = setup_data 82 | X, y = df.get_feature_target_pair(multi_target=False) 83 | era_series = df.get_era_data 84 | features = df.get_feature_data 85 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 86 | 87 | gpp = GroupStatsPreProcessor(groups=["sunshine", "rain"]) 88 | fncv3_selector = ColumnSelector(fncv3_cols) 89 | preproc_pipe = make_union(gpp, fncv3_selector) 90 | 91 | xgb = MetaEstimator(XGBRegressor()) 92 | fn = FeatureNeutralizer(proportion=0.5) 93 | model_pipe = make_pipeline(preproc_pipe, xgb, fn) 94 | 95 | model_pipe.fit(X, y) 96 | 97 | preds = model_pipe.predict(X, era_series=era_series, features=features) 98 | assert preds.min() >= 0 99 | assert abs(preds.max() - 1) <= 1e-9 100 | assert preds.shape[0] == X.shape[0] 101 | 102 | 103 | def test_column_transformer_pipeline(setup_data): 104 | df = setup_data 105 | X, y = df.get_feature_target_pair(multi_target=False) 106 | 107 | era_series = df.get_era_data 108 | features = df.get_feature_data 109 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 110 | 111 | gpp = 
GroupStatsPreProcessor(groups=["sunshine", "rain"]) 112 | preproc_pipe = ColumnTransformer([("gpp", gpp, features.columns.tolist()), ("selector", "passthrough", fncv3_cols[2:])]) 113 | xgb = MetaEstimator(XGBRegressor()) 114 | fn = FeatureNeutralizer(proportion=0.5) 115 | model_pipe = make_pipeline(preproc_pipe, xgb, fn) 116 | 117 | model_pipe.fit(X, y) 118 | 119 | preds = model_pipe.predict(X, era_series=era_series, features=features) 120 | assert preds.min() >= 0 121 | assert abs(preds.max() - 1) <= 1e-9 122 | assert preds.shape[0] == X.shape[0] 123 | -------------------------------------------------------------------------------- /tests/test_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from scipy.stats import rankdata 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.datasets import make_regression 7 | from sklearn.utils.validation import check_is_fitted 8 | 9 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 10 | 11 | 12 | ##### Mock objects ##### 13 | @pytest.fixture 14 | def sample_data(): 15 | return make_regression(n_samples=100, n_features=20, noise=0.1) 16 | 17 | 18 | @pytest.fixture 19 | def ensemble(): 20 | return NumeraiEnsemble() 21 | 22 | 23 | ##### NumeraiEnsemble ##### 24 | 25 | 26 | def test_numeraiensemble_fit(ensemble, sample_data): 27 | X, y = sample_data 28 | ensemble.fit(X, y) 29 | check_is_fitted(ensemble) 30 | assert issubclass(type(ensemble), (TransformerMixin, BaseEstimator)) 31 | 32 | 33 | def test_numeraiensemble_predict(ensemble, sample_data): 34 | X, y = sample_data 35 | ensemble = NumeraiEnsemble(weights=[0.05, 0.05, 0.3, 0.3, 0.3]) 36 | ensemble.fit(X, y) 37 | eras = np.array([1] * 50 + [2] * 50) 38 | input_preds = np.random.uniform(size=(100, 5)) 39 | 40 | ensemble_preds = ensemble.predict(input_preds, era_series=eras) 41 | # The length of output should have the same shape as input preds 42 | assert len(ensemble_preds) == len(input_preds) 43 | # Output should be a numpy array with values between 0 and 1 44 | assert isinstance(ensemble_preds, np.ndarray) 45 | assert len(ensemble_preds.shape) == 2 46 | assert ensemble_preds.min() >= 0 47 | assert ensemble_preds.max() <= 1 48 | 49 | # Test with Pandas Series into 50 | input_preds = pd.DataFrame(input_preds) 51 | eras = pd.Series(eras) 52 | ensemble_preds = ensemble.predict(input_preds, eras) 53 | 54 | 55 | def test_numeraiensemble_standardize(ensemble, sample_data): 56 | X, y = sample_data 57 | ensemble.fit(X, y) 58 | 59 | data = np.array([1, 2, 3, 4, 5]) 60 | standardized_data = ensemble._standardize(data) 61 | 62 | expected = (rankdata(data, method="ordinal") - 0.5) / len(data) 63 | 64 | assert np.allclose(standardized_data, expected) 65 | 66 | 67 | def test_numeraiensemble_standardize_by_era(ensemble): 68 | eras = np.array([1, 1, 1, 2, 2, 2]) 69 | 70 | # Test 1: Basic functionality 71 | X = np.array([0.5, 0.7, 0.1, 0.9, 0.6, 0.3]) 72 | standardized = ensemble._standardize_by_era(X, eras) 73 | # These values are simply computed based on manual calculations for rank and normalization 74 | expected_values_1 = [0.5, 0.83333333, 0.16666667, 0.83333333, 0.5, 0.16666667] 75 | assert np.allclose(standardized, expected_values_1) 76 | 77 | # Test 2: Check standardized values for all same predictions split across two different eras 78 | X = np.array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5]) 79 | standardized = ensemble._standardize_by_era(X, eras) 80 | 
expected_values_2 = [0.16666667, 0.5, 0.83333333, 0.16666667, 0.5, 0.83333333] 81 | assert np.allclose(standardized, expected_values_2) 82 | 83 | # Test 3: Different predictions but split across two eras 84 | X = np.array([0.1, 0.9, 0.9, 0.1, 0.1, 0.9]) 85 | standardized = ensemble._standardize_by_era(X, eras) 86 | expected_values_3 = [0.16666667, 0.5, 0.83333333, 0.16666667, 0.5, 0.83333333] 87 | assert np.allclose(standardized, expected_values_3) 88 | 89 | 90 | def test_numeraiensemble_predict_with_constant_values(ensemble): 91 | # Create an instance of your ensemble with mock estimators 92 | era_series = np.random.randint(1, 5, size=100) 93 | 94 | X_fit = np.random.rand(100, 3) 95 | y_fit = np.random.rand(100) 96 | ensemble.fit(X_fit, y_fit) 97 | 98 | constant_preds = np.ones((100, 5)) 99 | 100 | with pytest.raises(ValueError, match="Predictions for all columns are constant. No valid predictions to ensemble."): 101 | with pytest.warns(UserWarning, match="Some estimator predictions are constant. Consider checking your estimators. Skipping these estimator predictions in ensembling."): 102 | ensemble.predict(constant_preds, era_series) 103 | 104 | 105 | def test_numeraiensemble_predict_with_nans(ensemble): 106 | # Create an instance of your ensemble with mock estimators 107 | era_series = np.random.randint(1, 5, size=100) 108 | 109 | X_fit = np.random.rand(100, 3) 110 | y_fit = np.random.rand(100) 111 | ensemble.fit(X_fit, y_fit) 112 | 113 | nan_preds = np.ones((100, 5)) 114 | nan_preds[5:15, 0] = np.nan 115 | nan_preds[:5, 1] = np.nan 116 | 117 | with pytest.warns(UserWarning, match="Predictions in column"): 118 | ensemble_preds = ensemble.predict(nan_preds, era_series) 119 | assert len(ensemble_preds) == len(nan_preds) 120 | # Output should be a numpy array with values between 0 and 1 121 | assert isinstance(ensemble_preds, np.ndarray) 122 | # There must be some nans in the data. 
123 | assert np.sum(np.isnan(ensemble_preds)) >= 0 124 | # None nan values should be between 0 and 1 125 | non_nan_values = ensemble_preds[~np.isnan(ensemble_preds)] 126 | if non_nan_values.size > 0: 127 | assert non_nan_values.min() >= 0 128 | assert non_nan_values.max() <= 1 129 | 130 | 131 | def test_numeraiensemble_donate_weights(ensemble): 132 | ensemble.donate_weighted = True 133 | # For 3 predictions, weights should be [0.25, 0.25, 0.5] 134 | assert ensemble._get_donate_weights(n=3) == [0.25, 0.25, 0.5] 135 | # For 5, weights should be [0.0625, 0.0625, 0.125, 0.25, 0.5] 136 | assert ensemble._get_donate_weights(n=5) == [0.0625, 0.0625, 0.125, 0.25, 0.5] 137 | 138 | 139 | def test_numeraiensemble_donate_weights_sum_to_one(ensemble): 140 | ensemble.donate_weighted = True 141 | for n_estimators in range(1, 11): 142 | # Assert that the sum of weights is close to 1 143 | assert np.isclose(sum(ensemble._get_donate_weights(n=n_estimators)), 1.0) 144 | 145 | 146 | def test_numeraiensemble_get_feature_names_out(ensemble): 147 | X = np.random.rand(10, 3) 148 | y = np.random.rand(10) 149 | ensemble.fit(X, y) 150 | assert ensemble.get_feature_names_out() == ["numerai_ensemble_predictions"] 151 | assert ensemble.get_feature_names_out(["a", "b"]) == ["a", "b"] 152 | 153 | 154 | def test_numeraiensemble_set_output(ensemble, sample_data): 155 | X, y = sample_data 156 | era_series = np.array([1] * 50 + [2] * 50) 157 | ens_ins = ensemble 158 | ens_ins.fit(X, y) 159 | 160 | ens_ins.set_output(transform="pandas") 161 | preds = ens_ins.predict(X, era_series=era_series) 162 | assert isinstance(preds, pd.DataFrame) 163 | ens_ins.set_output(transform="default") 164 | preds = ens_ins.predict(X, era_series=era_series) 165 | assert isinstance(preds, np.ndarray) 166 | 167 | 168 | ##### PredictionReducer ##### 169 | 170 | 171 | def test_prediction_reducer(): 172 | # Simulated probability predictions for 3 samples, 2 models and 3 classes 173 | X = np.array([[0.1, 0.7, 0.2, 0.2, 0.5, 0.3], [0.2, 0.5, 0.3, 0.3, 0.3, 0.4], [0.6, 0.2, 0.2, 0.4, 0.4, 0.2]]) 174 | 175 | reducer = PredictionReducer(n_models=2, n_classes=3) 176 | reduced_X = reducer.fit_transform(X) 177 | 178 | # The expected result is a 3x2 matrix 179 | expected_result = np.array([[0.7 * 1 + 0.2 * 2, 0.5 * 1 + 0.3 * 2], [0.5 * 1 + 0.3 * 2, 0.3 * 1 + 0.4 * 2], [0.2 * 1 + 0.2 * 2, 0.4 * 1 + 0.2 * 2]]) 180 | 181 | assert reduced_X.shape == (3, 2) 182 | np.testing.assert_array_almost_equal(reduced_X, expected_result) 183 | 184 | assert issubclass(type(reducer), (BaseEstimator, TransformerMixin)) 185 | 186 | # Set output API 187 | reducer.set_output(transform="pandas") 188 | preds = reducer.predict(X) 189 | assert isinstance(preds, pd.DataFrame) 190 | reducer.set_output(transform="default") 191 | preds = reducer.predict(X) 192 | assert isinstance(preds, np.ndarray) 193 | 194 | 195 | def test_prediction_reducer_feature_names_out(): 196 | reducer = PredictionReducer(n_models=3, n_classes=4) 197 | feature_names = reducer.get_feature_names_out() 198 | expected_names = ["reduced_prediction_0", "reduced_prediction_1", "reduced_prediction_2"] 199 | 200 | assert feature_names == expected_names 201 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from numerblox.misc import AttrDict, Key, load_key_from_json 2 | 3 | 4 | def test_attrdict(): 5 | test_dict = AttrDict({"test1": "hello", "test2": "world"}) 6 | assert 
hasattr(test_dict, "test1") 7 | assert hasattr(test_dict, "test2") 8 | assert test_dict.test1 == test_dict["test1"] 9 | assert test_dict.test2 == test_dict["test2"] 10 | 11 | 12 | def test_key(): 13 | pub_id, secret_key = "Hello", "World" 14 | example_key = Key(pub_id=pub_id, secret_key=secret_key) 15 | assert str(example_key) == example_key.__repr__() 16 | assert (example_key.pub_id, example_key.secret_key) == (pub_id, secret_key) 17 | 18 | 19 | def test_load_key_from_json(): 20 | example_key = load_key_from_json("tests/test_assets/mock_credentials.json") 21 | assert (example_key.pub_id, example_key.secret_key) == ("Hello", "World") 22 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import create_classic_sample_data 3 | 4 | from numerblox.models import EraBoostedXGBRegressor 5 | 6 | setup_data = create_classic_sample_data 7 | 8 | 9 | def test_initialization(): 10 | model = EraBoostedXGBRegressor() 11 | assert model.proportion == 0.5 12 | assert model.trees_per_step == 10 13 | assert model.num_iters == 200 14 | assert model.n_estimators == 100 15 | 16 | custom_model = EraBoostedXGBRegressor(proportion=0.3, trees_per_step=5, num_iters=10) 17 | assert custom_model.proportion == 0.3 18 | assert custom_model.trees_per_step == 5 19 | assert custom_model.num_iters == 10 20 | 21 | 22 | def test_fit_method(setup_data): 23 | model = EraBoostedXGBRegressor(proportion=0.5, num_iters=5, n_estimators=100, max_depth=3, learning_rate=0.1) 24 | X, y, era_series = setup_data[["feature1", "feature2"]], setup_data["target"], setup_data["era"] 25 | initial_tree_count = model.n_estimators 26 | 27 | model.fit(X, y, era_series=era_series, verbose=500) 28 | 29 | assert model.n_estimators > initial_tree_count 30 | # Check if the final number of trees is as expected 31 | expected_final_tree_count = initial_tree_count + (model.num_iters - 1) * model.trees_per_step 32 | assert model.n_estimators == expected_final_tree_count 33 | 34 | 35 | def test_predictions(setup_data): 36 | model = EraBoostedXGBRegressor(num_iters=5, proportion=0.5, n_estimators=100, learning_rate=0.1, max_depth=3) 37 | X, y, era_series = setup_data[["feature1", "feature2"]], setup_data["target"], setup_data["era"] 38 | model.fit(X, y, era_series=era_series) 39 | 40 | predictions = model.predict(X) 41 | assert len(predictions) == len(X) 42 | # Check that predictions are not constant. 43 | assert len(set(predictions)) > 1 44 | # Check that it has fitted the data reasonably well. 
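# (np.corrcoef returns the Pearson correlation matrix; the off-diagonal entry taken below is the in-sample correlation between predictions and the target, and 0.8 is a loose sanity threshold rather than a performance guarantee.)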
45 | correlation = np.corrcoef(predictions, y)[0, 1] 46 | assert correlation > 0.8 47 | -------------------------------------------------------------------------------- /tests/test_neutralizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | import sklearn 5 | from sklearn.pipeline import make_pipeline 6 | from sklearn.utils._metadata_requests import MetadataRequest 7 | from utils import create_classic_sample_data 8 | 9 | from numerblox.neutralizers import BaseNeutralizer, FeatureNeutralizer 10 | 11 | sklearn.set_config(enable_metadata_routing=True) 12 | 13 | setup_data = create_classic_sample_data 14 | 15 | 16 | def test_base_neutralizer_initialization(): 17 | bn = BaseNeutralizer(new_col_names=["test"]) 18 | assert bn.new_col_names == ["test"] 19 | 20 | 21 | def test_base_neutralizer_fit(setup_data): 22 | obj = BaseNeutralizer(new_col_names=["test"]).fit(setup_data) 23 | assert isinstance(obj, BaseNeutralizer) 24 | 25 | 26 | def test_feature_neutralizer_initialization(): 27 | fn = FeatureNeutralizer() 28 | assert fn.new_col_names[0].startswith("prediction_neutralized_") 29 | 30 | # Proportion must be between 0 and 1 31 | with pytest.raises(AssertionError): 32 | FeatureNeutralizer(proportion=[1.1]) 33 | with pytest.raises(AssertionError): 34 | FeatureNeutralizer(proportion=[-0.1]) 35 | 36 | # Test routing 37 | routing = fn.get_metadata_routing() 38 | assert isinstance(routing, MetadataRequest) 39 | assert routing.consumes("transform", ["features", "era_series"]) == set({"features", "era_series"}) 40 | assert routing.consumes("predict", ["features", "era_series"]) == set({"features", "era_series"}) 41 | 42 | 43 | def test_feature_neutralizer_length_mismatch_X_features(setup_data): 44 | fn = FeatureNeutralizer() 45 | features = setup_data[["feature1", "feature2"]] 46 | era_series = setup_data["era"] 47 | X = setup_data["prediction"][:-1] # Remove one element to cause mismatch 48 | 49 | with pytest.raises(AssertionError): 50 | fn.transform(X, features=features, era_series=era_series) 51 | 52 | 53 | def test_feature_neutralizer_length_mismatch_X_eras(setup_data): 54 | fn = FeatureNeutralizer() 55 | features = setup_data[["feature1", "feature2"]] 56 | era_series = setup_data["era"][:-1] # Remove one element to cause mismatch 57 | X = setup_data["prediction"] 58 | 59 | with pytest.raises(AssertionError): 60 | fn.transform(X, features=features, era_series=era_series) 61 | 62 | 63 | def test_feature_neutralizer_incorrect_dim_X_single_pred(setup_data): 64 | fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"]) 65 | features = setup_data[["feature1", "feature2"]] 66 | era_series = setup_data["era"] 67 | X = setup_data["prediction"] # X is 1D, but two prediction names are provided 68 | 69 | with pytest.raises(AssertionError): 70 | fn.transform(X, features=features, era_series=era_series) 71 | 72 | 73 | def test_feature_neutralizer_incorrect_dim_X_multi_pred(setup_data): 74 | fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"]) 75 | features = setup_data[["feature1", "feature2"]] 76 | era_series = setup_data["era"] 77 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 78 | X = setup_data[["prediction"]] # Only one column provided, but two expected 79 | 80 | with pytest.raises(AssertionError): 81 | fn.transform(X, features=features, era_series=era_series) 82 | 83 | 84 | def test_feature_neutralizer_predict(setup_data): 85 | fn = 
FeatureNeutralizer(pred_name="prediction", proportion=0.5) 86 | features = setup_data[["feature1", "feature2"]] 87 | era_series = setup_data["era"] 88 | X = setup_data["prediction"] 89 | result = fn.transform(X, features=features, era_series=era_series) 90 | assert len(result) == len(setup_data) 91 | assert result.shape[1] == 1 92 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 93 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 94 | 95 | 96 | def test_feature_neutralizer_transform_no_era(setup_data): 97 | fn = FeatureNeutralizer(pred_name="prediction", proportion=0.5) 98 | features = setup_data[["feature1", "feature2"]] 99 | X = setup_data["prediction"] 100 | # Ensure warning is raised. Omitting era_series with .set_transform_request(era_series=True) does not raise an error. 101 | with pytest.warns(UserWarning): 102 | result = make_pipeline(fn).transform(X, features=features) 103 | assert len(result) == len(setup_data) 104 | assert result.shape[1] == 1 105 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 106 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 107 | 108 | fn.set_transform_request(era_series=False) 109 | # Ensure warning is raised 110 | with pytest.warns(UserWarning): 111 | result2 = fn.transform(X, features=features) 112 | assert np.all(result == result2) 113 | assert len(result2) == len(setup_data) 114 | assert result2.shape[1] == 1 115 | assert np.all(np.isclose(result2, 0, atol=1e-8) | (result >= 0)) 116 | assert np.all(np.isclose(result2, 1, atol=1e-8) | (result <= 1)) 117 | 118 | fn.set_transform_request(era_series=None) 119 | era_series = setup_data["era"] 120 | # Passing era_series should give an error with metadata routing set to None 121 | with pytest.raises(ValueError): 122 | make_pipeline(fn).fit_transform(X, features=features, era_series=era_series) 123 | 124 | 125 | def test_feature_neutralizer_predict_multi_pred(setup_data): 126 | fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5]) 127 | features = setup_data[["feature1", "feature2"]] 128 | era_series = setup_data["era"] 129 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 130 | X = setup_data[["prediction", "prediction2"]] 131 | result = fn.transform(X, features=features, era_series=era_series) 132 | assert len(result) == len(setup_data) 133 | assert result.shape[1] == 2 134 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 135 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 136 | 137 | 138 | def test_feature_neutralizer_predict_multi_prop(setup_data): 139 | fn = FeatureNeutralizer(pred_name="prediction", proportion=[0.5, 0.7]) 140 | features = setup_data[["feature1", "feature2"]] 141 | era_series = setup_data["era"] 142 | X = setup_data["prediction"] 143 | result = fn.transform(X, features=features, era_series=era_series) 144 | assert len(result) == len(setup_data) 145 | assert result.shape[1] == 2 146 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 147 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 148 | 149 | 150 | def test_feature_neutralizer_multi_pred_multi_prop(setup_data): 151 | fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5, 0.7, 0.9]) 152 | features = setup_data[["feature1", "feature2"]] 153 | era_series = setup_data["era"] 154 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 155 | X = setup_data[["prediction", "prediction2"]] 156 | result = fn.transform(X, 
features=features, era_series=era_series) 157 | assert len(result) == len(setup_data) 158 | assert result.shape[1] == 6 159 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 160 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 161 | 162 | # Test with numpy X 163 | result = fn.transform(X.to_numpy(), features=features, era_series=era_series) 164 | assert len(result) == len(setup_data) 165 | assert result.shape[1] == 6 166 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 167 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 168 | 169 | 170 | def test_feature_neutralizer_neutralize(setup_data): 171 | columns = ["prediction"] 172 | by = ["feature1", "feature2"] 173 | scores = FeatureNeutralizer().neutralize(setup_data, columns, by, proportion=0.5) 174 | assert isinstance(scores, pd.DataFrame) 175 | 176 | 177 | def test_feature_neutralizer_get_feature_names_out(): 178 | names = FeatureNeutralizer().get_feature_names_out() 179 | assert names == ["prediction_neutralized_0.5"] 180 | 181 | 182 | def test_feature_neutralizer_get_feature_names_out_complex(): 183 | names = FeatureNeutralizer(pred_name="fancy", suffix="blob").get_feature_names_out() 184 | assert names == ["fancy_neutralized_0.5_blob"] 185 | 186 | 187 | def test_feature_neutralizer_get_feature_names_out_with_input_features(): 188 | names = FeatureNeutralizer().get_feature_names_out(input_features=["prediction_fancy1"]) 189 | assert names == ["prediction_fancy1"] 190 | -------------------------------------------------------------------------------- /tests/test_numerframe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from numerai_era_data.date_utils import ERA_ONE_START 5 | 6 | from numerblox.feature_groups import FNCV3_FEATURES, MEDIUM_FEATURES, SMALL_FEATURES, V5_FEATURE_GROUP_MAPPING 7 | from numerblox.numerframe import NumerFrame, create_numerframe 8 | 9 | TEST_FILE_PATH = "tests/test_assets/val_3_eras.parquet" 10 | dataset = pd.read_parquet(TEST_FILE_PATH) 11 | 12 | 13 | def test_numerframe_initialization(): 14 | nf = NumerFrame(dataset) 15 | assert isinstance(nf, NumerFrame) 16 | assert nf.meta == {"era_col": "era"} 17 | assert nf.meta.era_col == "era" 18 | 19 | 20 | def test_get_feature_data(): 21 | nf = NumerFrame(dataset) 22 | features = nf.get_feature_data 23 | assert isinstance(features, NumerFrame) 24 | assert all([col.startswith("feature_") for col in features.columns.tolist()]) 25 | 26 | 27 | def test_get_pattern_data(): 28 | nf = NumerFrame(dataset) 29 | xerxes_targets = nf.get_pattern_data("xerxes") 30 | assert isinstance(xerxes_targets, NumerFrame) 31 | assert xerxes_targets.columns.tolist() == ["target_xerxes_20", "target_xerxes_60"] 32 | 33 | 34 | def test_get_target_data(): 35 | nf = NumerFrame(dataset) 36 | targets = nf.get_target_data 37 | assert isinstance(targets, NumerFrame) 38 | assert all([col.startswith("target") for col in targets.columns.tolist()]) 39 | 40 | 41 | def test_get_single_target_data(): 42 | nf = NumerFrame(dataset) 43 | single_target = nf.get_single_target_data 44 | assert isinstance(single_target, NumerFrame) 45 | assert single_target.columns.tolist() == ["target"] 46 | 47 | 48 | def test_get_prediction_data(): 49 | nf = NumerFrame(dataset) 50 | preds = nf.get_prediction_data 51 | assert isinstance(preds, NumerFrame) 52 | assert preds.columns.tolist() == [] 53 | 54 | 55 | def test_get_column_selection(): 56 | nf = 
NumerFrame(dataset) 57 | result = nf.get_column_selection(["feature_itinerant_hexahedral_photoengraver"]) 58 | assert isinstance(result, NumerFrame) 59 | assert result.columns.tolist() == ["feature_itinerant_hexahedral_photoengraver"] 60 | 61 | 62 | def test_get_aux_data(): 63 | nf = NumerFrame(dataset) 64 | aux_data = nf.get_aux_data 65 | assert isinstance(aux_data, NumerFrame) 66 | assert aux_data.columns.tolist() == ["era", "data_type"] 67 | 68 | 69 | def test_get_era_data(): 70 | nf = NumerFrame(dataset) 71 | era_data = nf.get_era_data 72 | assert isinstance(era_data, NumerFrame) 73 | assert era_data.columns.tolist() == ["era"] 74 | 75 | 76 | def test_get_prediction_aux_data(): 77 | nf = NumerFrame(dataset) 78 | nf["prediction"] = 1 79 | nf = NumerFrame(nf) 80 | pred_aux = nf.get_prediction_aux_data 81 | assert isinstance(pred_aux, NumerFrame) 82 | assert pred_aux.columns.tolist() == ["prediction", "era", "data_type"] 83 | 84 | 85 | def test_get_feature_target_pair(): 86 | nf = NumerFrame(dataset) 87 | X, y = nf.get_feature_target_pair() 88 | assert isinstance(X, NumerFrame) 89 | assert X.columns.tolist() == nf.get_feature_data.columns.tolist() 90 | assert y.columns.tolist() == ["target"] 91 | 92 | 93 | def test_get_feature_target_pair_multi_target(): 94 | nf = NumerFrame(dataset) 95 | X, y = nf.get_feature_target_pair(multi_target=True) 96 | assert isinstance(X, NumerFrame) 97 | assert X.columns.tolist() == nf.get_feature_data.columns.tolist() 98 | assert y.columns.tolist() == nf.get_target_data.columns.tolist() 99 | 100 | 101 | def test_get_fncv3_features(): 102 | nf = NumerFrame(dataset) 103 | result = nf.get_fncv3_feature_data 104 | assert isinstance(result, NumerFrame) 105 | assert result.columns.tolist() == FNCV3_FEATURES 106 | 107 | 108 | def test_get_small_features(): 109 | nf = NumerFrame(dataset) 110 | result = nf.get_small_feature_data 111 | assert isinstance(result, NumerFrame) 112 | assert result.columns.tolist() == SMALL_FEATURES 113 | 114 | 115 | def test_get_medium_features(): 116 | nf = NumerFrame(dataset) 117 | result = nf.get_medium_feature_data 118 | assert isinstance(result, NumerFrame) 119 | assert result.columns.tolist() == MEDIUM_FEATURES 120 | 121 | 122 | def test_get_unique_eras(): 123 | nf = NumerFrame(dataset) 124 | result = nf.get_unique_eras 125 | assert isinstance(result, list) 126 | assert result == ["0575", "0576", "0577"] 127 | 128 | 129 | def test_get_feature_group(): 130 | # Test with a valid group name 131 | nf = NumerFrame(dataset) 132 | result = nf.get_feature_group("rain") 133 | assert isinstance(result, NumerFrame) 134 | assert result.columns.tolist() == V5_FEATURE_GROUP_MAPPING["rain"] 135 | 136 | # Test with an invalid group name 137 | with pytest.raises(AssertionError, match=r".*not found in.*"): 138 | nf.get_feature_group("group_invalid") 139 | 140 | 141 | def test_get_last_n_eras(): 142 | nf = NumerFrame(dataset) 143 | result = nf.get_last_n_eras(2) 144 | assert isinstance(result, NumerFrame) 145 | assert result[nf.meta.era_col].unique().tolist() == ["0576", "0577"] 146 | assert result.shape == (11313, 2415) 147 | 148 | 149 | def test_get_era_batch(): 150 | nf = NumerFrame(dataset) 151 | eras = ["0575", "0576"] 152 | X, y = nf.get_era_batch(eras=eras) 153 | assert isinstance(X, np.ndarray) 154 | assert X.shape == (11230, 2376) 155 | assert y.shape == (11230, 37) 156 | 157 | 158 | def test_get_era_from_date(): 159 | nf = NumerFrame(dataset) 160 | era = nf.get_era_from_date(pd.Timestamp("2016-01-01")) 161 | assert isinstance(era, int) 162 | 
assert era == 677 163 | 164 | era1 = nf.get_era_from_date(pd.Timestamp(ERA_ONE_START)) 165 | assert isinstance(era1, int) 166 | assert era1 == 1 167 | 168 | 169 | def test_get_date_from_era(): 170 | nf = NumerFrame(dataset) 171 | date = nf.get_date_from_era(era=4) 172 | assert isinstance(date, pd.Timestamp) 173 | assert date == pd.Timestamp("2003-02-01") 174 | 175 | date1 = nf.get_date_from_era(era=1) 176 | assert isinstance(date1, pd.Timestamp) 177 | assert date1 == pd.Timestamp(ERA_ONE_START) 178 | 179 | 180 | def test_get_dates_from_era_col(): 181 | nf = NumerFrame(dataset).iloc[:5] 182 | result = nf.get_dates_from_era_col 183 | assert isinstance(result, pd.Series) 184 | assert all(result.index == nf.index[:5]) 185 | assert result.tolist() == [pd.Timestamp("2014-01-11 00:00:00")] * len(result) 186 | 187 | 188 | def test_get_eras_from_date_col(): 189 | dataset_copy = dataset.copy() 190 | # Use a smaller range of dates 191 | dataset_copy["date"] = [pd.Timestamp(ERA_ONE_START) + pd.Timedelta(days=i) for i in range(len(dataset_copy))] 192 | dataset_copy = dataset_copy.drop(columns="era") 193 | nf = NumerFrame(dataset_copy.iloc[:5]) 194 | result = nf.get_eras_from_date_col 195 | assert isinstance(result, pd.Series) 196 | assert all(result.index == nf.index[:5]) 197 | assert result.tolist() == [1, 1, 1, 1, 1] 198 | 199 | 200 | def test_get_era_range(): 201 | nf = NumerFrame(dataset) 202 | result = nf.get_era_range(start_era=575, end_era=576) 203 | assert isinstance(result, NumerFrame) 204 | assert result[nf.meta.era_col].unique().tolist() == ["0575", "0576"] 205 | assert result.shape == (11230, 2415) 206 | 207 | with pytest.raises(AssertionError): 208 | no_era_dataset = dataset.drop("era", axis="columns") 209 | no_era_dataset["date"] = pd.Timestamp("2016-01-01") 210 | nf = NumerFrame(no_era_dataset) 211 | nf.get_era_range(start_era=1, end_era=3) 212 | # Negative era 213 | with pytest.raises(AssertionError): 214 | nf.get_era_range(-1, 5) 215 | # End era before start era 216 | with pytest.raises(AssertionError): 217 | nf.get_era_range(20, 3) 218 | # Start era not int 219 | with pytest.raises(AssertionError): 220 | nf.get_era_range("0001", 2) 221 | # End era not int 222 | with pytest.raises(AssertionError): 223 | nf.get_era_range(1, "0002") 224 | 225 | 226 | def test_get_date_range(): 227 | date_col_dataset = dataset.drop("era", axis="columns") 228 | date_col_dataset["date"] = [pd.Timestamp("2016-01-01") + pd.Timedelta(days=i) for i in range(0, len(date_col_dataset))] 229 | nf = NumerFrame(date_col_dataset) 230 | result = nf.get_date_range(start_date=pd.Timestamp("2016-01-01"), end_date=pd.Timestamp("2016-01-03")) 231 | assert isinstance(result, NumerFrame) 232 | assert result[nf.meta.era_col].unique().tolist() == [pd.Timestamp("2016-01-01"), pd.Timestamp("2016-01-02"), pd.Timestamp("2016-01-03")] 233 | assert result.shape == (3, 2415) 234 | 235 | # End date before start date 236 | with pytest.raises(AssertionError): 237 | nf.get_date_range(pd.Timestamp("2022-01-05"), pd.Timestamp("2022-01-01")) 238 | # Date before era 1 239 | with pytest.raises(AssertionError): 240 | nf.get_date_range(pd.Timestamp("1970-01-05"), pd.Timestamp("1971-01-10")) 241 | # Start date not pd.Timestamp 242 | with pytest.raises(AssertionError): 243 | nf.get_date_range("2016-01-01", pd.Timestamp("2016-01-10")) 244 | # End date not pd.Timestamp 245 | with pytest.raises(AssertionError): 246 | nf.get_date_range(pd.Timestamp("2016-01-01"), "2016-01-10") 247 | 248 | 249 | def test_create_numerframe(): 250 | nf = 
create_numerframe(TEST_FILE_PATH) 251 | assert isinstance(nf, NumerFrame) 252 | -------------------------------------------------------------------------------- /tests/test_penalizers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from utils import create_classic_sample_data 3 | 4 | from numerblox.penalizers import BasePenalizer, FeaturePenalizer 5 | 6 | setup_data = create_classic_sample_data 7 | 8 | 9 | def test_base_penalizer_initialization(): 10 | bn = BasePenalizer(new_col_name="test") 11 | assert bn.new_col_name == "test" 12 | 13 | 14 | def test_base_penalizer_fit(setup_data): 15 | obj = BasePenalizer(new_col_name="test").fit(setup_data) 16 | assert isinstance(obj, BasePenalizer) 17 | 18 | 19 | @pytest.mark.xfail(reason="TensorFlow is not installed") 20 | def test_feature_penalizer_initialization(): 21 | fp = FeaturePenalizer(max_exposure=0.5) 22 | assert fp.new_col_name.startswith("prediction_penalized_") 23 | assert fp.max_exposure == 0.5 24 | 25 | 26 | @pytest.mark.xfail(reason="TensorFlow is not installed") 27 | def test_feature_penalizer_get_feature_names_out(): 28 | names = FeaturePenalizer(max_exposure=0.5).get_feature_names_out() 29 | assert names == ["prediction_penalized_0.5"] 30 | 31 | 32 | @pytest.mark.xfail(reason="TensorFlow is not installed") 33 | def test_feature_penalizer_get_feature_names_out_complex(): 34 | names = FeaturePenalizer(max_exposure=0.7, pred_name="fancy", suffix="blob").get_feature_names_out() 35 | assert names == ["fancy_penalized_0.7_blob"] 36 | 37 | 38 | @pytest.mark.xfail(reason="TensorFlow is not installed") 39 | def test_feature_penalizer_get_feature_names_out_with_input_features(): 40 | names = FeaturePenalizer(max_exposure=0.5).get_feature_names_out(input_features=["prediction_fancy1"]) 41 | assert names == ["prediction_fancy1"] 42 | -------------------------------------------------------------------------------- /tests/test_prediction_loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.datasets import make_regression 7 | from sklearn.pipeline import FeatureUnion, Pipeline 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | from numerblox.prediction_loaders import BasePredictionLoader, ExamplePredictions 11 | 12 | 13 | def test_example_predictions_basic(): 14 | ep = ExamplePredictions() 15 | preds = ep.fit_transform(None) 16 | # Check all values are between 0 and 1 17 | assert preds["prediction"].min() >= 0 18 | assert preds["prediction"].max() <= 1 19 | assert isinstance(preds, pd.DataFrame) 20 | assert issubclass(ExamplePredictions, (BasePredictionLoader, TransformerMixin, BaseEstimator)) 21 | assert BasePredictionLoader.__bases__[-1] == BaseEstimator, "BaseEstimator must be the rightmost base class" 22 | 23 | 24 | def test_example_predictions_pipeline(): 25 | # Create dummy dataset 26 | X, y = make_regression(n_samples=100, n_features=20, noise=0.1) 27 | X = pd.DataFrame(X) 28 | 29 | # Create pipeline with standard scaler and example predictions 30 | pipeline = Pipeline([("scaler", StandardScaler()), ("predictions", ExamplePredictions())]) 31 | # Get results 32 | preds = pipeline.fit_transform(X, y) 33 | 34 | # Check all values are between 0 and 1 35 | assert preds["prediction"].min() >= 0 36 | assert preds["prediction"].max() <= 1 37 | assert isinstance(preds, pd.DataFrame) 38 | 39 | 40 | 
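# Illustrative usage sketch (not a test and never invoked): based only on the calls
# exercised in this module, ExamplePredictions can also be used standalone —
# fit_transform(None) downloads Numerai's example predictions and returns a DataFrame
# with a "prediction" column in [0, 1]; keep_files controls whether the downloaded
# file is kept on disk afterwards.
def _example_predictions_usage_sketch():
    ep = ExamplePredictions(keep_files=False)
    live_example_preds = ep.fit_transform(None)
    # Summary statistics of the rank-style predictions (all within [0, 1]).
    return live_example_preds["prediction"].agg(["min", "max", "mean"])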
def test_example_predictions_feature_union(): 41 | # Get predictions in basic setting to compare output 42 | ep = ExamplePredictions() 43 | preds = ep.fit_transform(None) 44 | 45 | # Dummy data 46 | X, _ = make_regression(n_samples=len(preds), n_features=2, noise=0.1) 47 | 48 | # Create feature union 49 | combined_features = FeatureUnion([("standard", StandardScaler()), ("example", ExamplePredictions())]) 50 | 51 | # Transform data 52 | X_transformed = combined_features.fit_transform(X) 53 | 54 | # Ensure the transformation worked 55 | assert np.allclose(X_transformed[:, -1], preds["prediction"].values) 56 | assert X_transformed.shape[0] == X.shape[0] 57 | assert X_transformed.shape[1] == 3 58 | 59 | 60 | def test_example_predictions_get_feature_names_out(): 61 | ep = ExamplePredictions() 62 | assert ep.get_feature_names_out() == ["v5.0/live_example_preds"] 63 | assert ep.get_feature_names_out(["a", "b"]) == ["a", "b"] 64 | 65 | 66 | def test_example_predictions_keep_files(): 67 | # Test with keep_files = True 68 | ep_keep = ExamplePredictions(keep_files=True) 69 | ep_keep.fit_transform(None) 70 | assert os.path.isdir(ep_keep.downloader.dir), "Directory should be kept with keep_files=True" 71 | assert os.path.exists(ep_keep.dest_path), "File should be kept with keep_files=True" 72 | # Clean up 73 | ep_keep.downloader.remove_base_directory() 74 | 75 | # Test with keep_files = False 76 | ep_remove = ExamplePredictions(keep_files=False) 77 | ep_remove.fit_transform(None) 78 | assert not os.path.isdir(ep_remove.downloader.dir), "Directory should be removed with keep_files=False" 79 | -------------------------------------------------------------------------------- /tests/test_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from datetime import datetime 4 | from random import choices 5 | from string import ascii_uppercase 6 | from unittest.mock import patch 7 | from uuid import uuid4 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | from dateutil.relativedelta import FR, relativedelta 13 | 14 | from numerblox.misc import Key 15 | from numerblox.submission import NumeraiClassicSubmitter, NumeraiCryptoSubmitter, NumeraiSignalsSubmitter 16 | 17 | TARGET_NAME = "prediction" 18 | 19 | 20 | def _create_random_classic_df(): 21 | # Create random predictions dataframe 22 | n_rows = 100 23 | test_dataf = pd.DataFrame(np.random.uniform(size=n_rows), columns=[TARGET_NAME]) 24 | test_dataf["id"] = [uuid4() for _ in range(n_rows)] 25 | test_dataf = test_dataf.set_index("id") 26 | return test_dataf 27 | 28 | 29 | def create_random_signals_df(n_rows=1000): 30 | signals_test_dataf = pd.DataFrame(np.random.uniform(size=(n_rows, 1)), columns=["signal"]) 31 | signals_test_dataf["ticker"] = ["".join(choices(ascii_uppercase, k=4)) for _ in range(n_rows)] 32 | last_friday = str((datetime.now() + relativedelta(weekday=FR(-1))).date()).replace("-", "") 33 | signals_test_dataf["last_friday"] = last_friday 34 | signals_test_dataf["data_type"] = "live" 35 | return signals_test_dataf 36 | 37 | 38 | def test_classic_submitter(): 39 | # Initialization 40 | test_dir = f"test_sub_{uuid4()}" 41 | classic_key = Key(pub_id="Hello", secret_key="World") 42 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key) 43 | assert num_sub.dir.is_dir() 44 | 45 | # Save CSV 46 | test_dataf = _create_random_classic_df() 47 | file_name = "test.csv" 48 | num_sub.save_csv(dataf=test_dataf, 
file_name=file_name, cols=TARGET_NAME) 49 | num_sub.save_csv(dataf=test_dataf, file_name="test2.csv", cols=TARGET_NAME) 50 | assert (num_sub.dir / file_name).is_file() 51 | 52 | # Combine CSVs 53 | combined = num_sub.combine_csvs([f"{test_dir}/test.csv", f"{test_dir}/test2.csv"], aux_cols=["id"]) 54 | assert combined.columns == [TARGET_NAME] 55 | 56 | # Test that saving breaks if range is invalid. 57 | with pytest.raises(ValueError): 58 | invalid_signal = deepcopy(test_dataf) 59 | invalid_signal[TARGET_NAME] = invalid_signal[TARGET_NAME].add(10) 60 | num_sub.save_csv( 61 | invalid_signal, 62 | file_name="should_not_save.csv", 63 | cols=TARGET_NAME, 64 | ) 65 | 66 | # Wind down 67 | num_sub.remove_base_directory() 68 | assert not os.path.exists(test_dir) 69 | 70 | 71 | def test_signals_submitter(): 72 | # Initialization 73 | test_dir = f"test_sub_{uuid4()}" 74 | signals_key = Key(pub_id="Hello", secret_key="World") 75 | signals_sub = NumeraiSignalsSubmitter(directory_path=test_dir, key=signals_key) 76 | assert signals_sub.dir.is_dir() 77 | 78 | # Save CSVs 79 | test_dataf = create_random_signals_df() 80 | signals_cols = ["signal", "ticker", "data_type", "last_friday"] 81 | file_name = "signals_test.csv" 82 | signals_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=signals_cols) 83 | signals_sub.save_csv(dataf=test_dataf, file_name="signals_test2.csv", cols=signals_cols) 84 | 85 | combined_signals = signals_sub.combine_csvs(csv_paths=[f"{test_dir}/signals_test.csv", f"{test_dir}/signals_test2.csv"], aux_cols=["ticker", "last_friday", "data_type"], era_col="last_friday", pred_col="signal") 86 | assert combined_signals.columns == ["signal"] 87 | 88 | # Test that saving breaks if range is invalid. 89 | with pytest.raises(ValueError): 90 | invalid_signal = deepcopy(test_dataf) 91 | invalid_signal.loc[0, "signal"] += 10 92 | signals_sub.save_csv( 93 | invalid_signal, 94 | file_name="should_not_save.csv", 95 | cols=list(invalid_signal.columns), 96 | ) 97 | 98 | # Test that saving breaks if ticker is invalid. 
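# (NumeraiSignalsSubmitter presumably validates that a recognized ticker column is present; renaming "ticker" to an arbitrary name should therefore make save_csv raise NotImplementedError, as asserted below.)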
99 | with pytest.raises(NotImplementedError): 100 | invalid_ticker = deepcopy(test_dataf) 101 | invalid_ticker = invalid_ticker.rename({"ticker": "not_a_valid_ticker_format"}, axis=1) 102 | signals_sub.save_csv( 103 | invalid_ticker, 104 | file_name="should_not_save.csv", 105 | cols=list(invalid_ticker.columns), 106 | ) 107 | # Wind down 108 | signals_sub.remove_base_directory() 109 | assert not os.path.exists(test_dir) 110 | 111 | 112 | def test_crypto_submitter(): 113 | # Initialization 114 | test_dir = f"test_sub_{uuid4()}" 115 | crypto_key = Key(pub_id="Hello", secret_key="World") 116 | crypto_sub = NumeraiCryptoSubmitter(directory_path=test_dir, key=crypto_key) 117 | assert crypto_sub.dir.is_dir() 118 | 119 | # Create random crypto predictions dataframe 120 | def create_random_crypto_df(n_rows=1000): 121 | crypto_test_dataf = pd.DataFrame(np.random.uniform(size=(n_rows, 1)), columns=["signal"]) 122 | crypto_test_dataf["symbol"] = [f"CRYPTO_{i:04d}" for i in range(n_rows)] 123 | return crypto_test_dataf 124 | 125 | # Save CSVs 126 | test_dataf = create_random_crypto_df() 127 | crypto_cols = ["symbol", "signal"] 128 | file_name = "crypto_test.csv" 129 | crypto_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=crypto_cols) 130 | crypto_sub.save_csv(dataf=test_dataf, file_name="crypto_test2.csv", cols=crypto_cols) 131 | 132 | combined_crypto = crypto_sub.combine_csvs(csv_paths=[f"{test_dir}/crypto_test.csv", f"{test_dir}/crypto_test2.csv"], aux_cols=["symbol"], pred_col="signal") 133 | assert combined_crypto.columns == ["signal"] 134 | 135 | # Test that saving breaks if range is invalid. 136 | with pytest.raises(ValueError): 137 | invalid_signal = deepcopy(test_dataf) 138 | invalid_signal.loc[0, "signal"] += 10 139 | crypto_sub.save_csv( 140 | invalid_signal, 141 | file_name="should_not_save.csv", 142 | cols=list(invalid_signal.columns), 143 | ) 144 | 145 | # Test that saving breaks if symbol column is missing 146 | with pytest.raises(AssertionError): 147 | invalid_symbol = deepcopy(test_dataf) 148 | invalid_symbol = invalid_symbol.rename(columns={"symbol": "not_symbol"}) 149 | crypto_sub.save_csv( 150 | invalid_symbol, 151 | file_name="should_not_save.csv", 152 | cols=list(invalid_symbol.columns), 153 | ) 154 | 155 | # Wind down 156 | crypto_sub.remove_base_directory() 157 | assert not os.path.exists(test_dir) 158 | 159 | 160 | def raise_api_error(*args, **kwargs): 161 | raise ValueError("Your session is invalid or has expired.") 162 | 163 | 164 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 165 | def test_upload_predictions_retries(mocked_get_model_id): 166 | test_dir = f"test_sub_{uuid4()}" 167 | classic_key = Key(pub_id="Hello", secret_key="World") 168 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 169 | file_name = "test.csv" 170 | 171 | # Save CSV 172 | test_dataf = _create_random_classic_df() 173 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 174 | 175 | with patch.object(num_sub.api, "upload_predictions", side_effect=raise_api_error) as mock_upload: 176 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 177 | # Check if retries happened 'max_retries' times 178 | assert mock_upload.call_count == num_sub.max_retries 179 | num_sub.remove_base_directory() 180 | 181 | 182 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 183 | def 
test_upload_predictions_fail_silently(mocked_get_model_id): 184 | test_dir = f"test_sub_{uuid4()}" 185 | classic_key = Key(pub_id="Hello", secret_key="World") 186 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 187 | file_name = "test.csv" 188 | 189 | # Save CSV 190 | test_dataf = _create_random_classic_df() 191 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 192 | 193 | with patch.object(num_sub.api, "upload_predictions", side_effect=raise_api_error): 194 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 195 | 196 | num_sub.remove_base_directory() 197 | 198 | 199 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 200 | def test_upload_predictions_exception_handling(mocked_get_model_id): 201 | test_dir = f"test_sub_{uuid4()}" 202 | classic_key = Key(pub_id="Hello", secret_key="World") 203 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 204 | file_name = "test.csv" 205 | 206 | # Save CSV 207 | test_dataf = _create_random_classic_df() 208 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 209 | 210 | with patch("builtins.print") as mock_print: 211 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 212 | assert mock_print.call_count >= num_sub.max_retries 213 | 214 | num_sub.remove_base_directory() 215 | 216 | 217 | # Tests for NumerBaySubmitter 218 | def test_numerbay_submitter(): 219 | pass 220 | -------------------------------------------------------------------------------- /tests/test_targets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import polars as pl 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from tqdm import tqdm 6 | from utils import create_signals_sample_data 7 | 8 | from numerblox.targets import BaseTargetProcessor, BayesianGMMTargetProcessor, SignalsTargetProcessor 9 | 10 | dataset = pd.read_parquet("tests/test_assets/val_3_eras.parquet") 11 | dummy_signals_data = create_signals_sample_data 12 | 13 | ALL_PROCESSORS = [BayesianGMMTargetProcessor, SignalsTargetProcessor] 14 | 15 | 16 | def test_processors_sklearn(): 17 | data = dataset.sample(50) 18 | data = data.drop(columns=["data_type"]) 19 | 20 | assert BaseTargetProcessor.__bases__[-1] == BaseEstimator, "BaseEstimator must be the rightmost base class" 21 | 22 | for processor_cls in tqdm(ALL_PROCESSORS, desc="Testing target processors for scikit-learn compatibility"): 23 | # Initialization 24 | processor = processor_cls() 25 | 26 | # Inherits from Sklearn classes 27 | assert issubclass(processor_cls, (BaseTargetProcessor, TransformerMixin, BaseEstimator)) 28 | 29 | # Test every processor has get_feature_names_out 30 | assert hasattr(processor, "get_feature_names_out"), f"Processor {processor.__class__.__name__} does not have get_feature_names_out. Every implemented target processor should have this method."
31 | 32 | 33 | def test_bayesian_gmm_target_preprocessor(): 34 | bgmm = BayesianGMMTargetProcessor(n_components=2) 35 | 36 | y = dataset["target_xerxes_20"].fillna(0.5) 37 | era_series = dataset["era"] 38 | feature_names = [ 39 | "feature_melismatic_daily_freak", 40 | "feature_pleasurable_facultative_benzol", 41 | ] 42 | X = dataset[feature_names] 43 | 44 | bgmm.fit(X, y, era_series=era_series) 45 | 46 | result = bgmm.transform(X, era_series=era_series) 47 | assert bgmm.get_feature_names_out() == ["fake_target"] 48 | assert len(result) == len(dataset) 49 | assert result.min() >= 0.0 50 | assert result.max() <= 1.0 51 | 52 | # _get_coefs 53 | coefs = bgmm._get_coefs(X, y, era_series=era_series) 54 | assert coefs.shape == (3, 2) 55 | assert coefs.min() >= 0.0 56 | assert coefs.max() <= 1.0 57 | 58 | # Test set_output API 59 | bgmm.set_output(transform="pandas") 60 | result = bgmm.transform(X, era_series=era_series) 61 | assert isinstance(result, pd.DataFrame) 62 | bgmm.set_output(transform="default") 63 | result = bgmm.transform(X, era_series=era_series) 64 | assert isinstance(result, np.ndarray) 65 | 66 | 67 | def test_signals_target_processor(dummy_signals_data): 68 | stp = SignalsTargetProcessor() 69 | stp.set_output(transform="pandas") 70 | era_series = dummy_signals_data["date"] 71 | stp.fit(dummy_signals_data) 72 | result = stp.transform(dummy_signals_data, era_series=era_series) 73 | expected_target_cols = ["target_10d_raw", "target_10d_rank", "target_10d_group", "target_20d_raw", "target_20d_rank", "target_20d_group"] 74 | for col in expected_target_cols: 75 | assert col in result.columns 76 | assert stp.get_feature_names_out() == expected_target_cols 77 | 78 | # Test set_output API 79 | stp.set_output(transform="default") 80 | result = stp.transform(dummy_signals_data, era_series=era_series) 81 | assert isinstance(result, np.ndarray) 82 | 83 | stp.set_output(transform="polars") 84 | result = stp.transform(dummy_signals_data, era_series=era_series) 85 | assert isinstance(result, pl.DataFrame) 86 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def create_classic_sample_data(): 8 | data = {"feature1": [1, 2, 3, 4, 3, 2, 1], "feature2": [4, 3, 2, 1, 3, 1, 2], "prediction": [0.5, 0.6, 0.7, 0.8, 0.2, 0.3, 0.4], "target": [0, 1, 0, 1, 0.25, 0.75, 0.5], "target_2": [0, 0.25, 0.75, 0.50, 0.25, 0.75, 0.5], "era": ["era1", "era2", "era1", "era2", "era1", "era2", "era1"]} 9 | return pd.DataFrame(data) 10 | 11 | 12 | @pytest.fixture 13 | def create_signals_sample_data(): 14 | instances = [] 15 | tickers = ["ABC.US", "DEF.US", "GHI.US", "JKL.US", "MNO.US"] 16 | for ticker in tickers: 17 | price = np.random.randint(10, 100) 18 | for i in range(100): 19 | price += np.random.uniform(-1, 1) 20 | instances.append( 21 | { 22 | "ticker": ticker, 23 | "date": pd.Timestamp("2020-01-01") + pd.Timedelta(days=i), 24 | "open": price - 0.05, 25 | "high": price + 0.02, 26 | "low": price - 0.01, 27 | "close": price, 28 | "adjusted_close": price * np.random.uniform(0.5, 1.5), 29 | "volume": np.random.randint(1000, 10000), 30 | "target": np.random.uniform(), 31 | "target_2": np.random.uniform(), 32 | "prediction": np.random.uniform(), 33 | "prediction_random": np.random.uniform(), 34 | } 35 | ) 36 | # Add instances with only 10 days of data 37 | unwanted_tickers = ["XYZ.US", "RST.US", 
"UVW.US"] 38 | price = np.random.randint(10, 100) 39 | for ticker in unwanted_tickers: 40 | for i in range(10): 41 | price += np.random.uniform(-1, 1) 42 | instances.append( 43 | { 44 | "ticker": ticker, 45 | "date": pd.Timestamp("2020-01-01") + pd.Timedelta(days=i), 46 | "open": price - 0.05, 47 | "high": price + 0.02, 48 | "low": price - 0.01, 49 | "close": price, 50 | "adjusted_close": price * np.random.uniform(0.5, 1.5), 51 | "volume": np.random.randint(1000, 10000), 52 | "target": np.random.uniform(), 53 | "target_2": np.random.uniform(), 54 | "prediction": np.random.uniform(), 55 | "prediction_random": np.random.uniform(), 56 | } 57 | ) 58 | return pd.DataFrame(instances) 59 | --------------------------------------------------------------------------------