├── .github └── workflows │ ├── ci.yml │ ├── codecov.yml │ ├── deploy-mkdocs.yml │ └── ruff.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── api.md ├── assets │ └── .icons │ │ ├── cc_white.svg │ │ └── favicon.ico ├── contributing.md ├── crowdcent.md ├── disclaimer.md ├── download.md ├── end_to_end.md ├── evaluation.md ├── index.md ├── meta.md ├── model_upload.md ├── models.md ├── numerframe.md ├── postprocessing.md ├── prediction_loaders.md ├── preprocessing.md ├── submission.md └── targets.md ├── examples ├── end_to_end.ipynb ├── google_cloud_storage.ipynb ├── numerai_pipeline.ipynb ├── numerbay_integration.ipynb ├── numerframe_tutorial.ipynb ├── quickstart.ipynb ├── submitting.ipynb └── synthetic_data_generation.ipynb ├── mkdocs.yml ├── pyproject.toml ├── pytest.ini ├── src └── numerblox │ ├── __init__.py │ ├── download.py │ ├── ensemble.py │ ├── evaluation.py │ ├── feature_groups.py │ ├── meta.py │ ├── misc.py │ ├── model_upload.py │ ├── models.py │ ├── neutralizers.py │ ├── numerframe.py │ ├── penalizers.py │ ├── prediction_loaders.py │ ├── preprocessing │ ├── __init__.py │ ├── base.py │ ├── classic.py │ └── signals.py │ ├── submission.py │ └── targets.py ├── tests ├── test_assets │ ├── eodhd-map.csv │ ├── mock_credentials.json │ └── val_3_eras.parquet ├── test_download │ ├── __init__.py │ ├── test_download_classic.py │ ├── test_download_crypto.py │ └── test_download_signals.py ├── test_end_to_end.py ├── test_ensemble.py ├── test_evaluation.py ├── test_meta.py ├── test_misc.py ├── test_models.py ├── test_neutralizers.py ├── test_numerframe.py ├── test_penalizers.py ├── test_prediction_loaders.py ├── test_preprocessing.py ├── test_submission.py ├── test_targets.py └── utils.py └── uv.lock /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.10', '3.11', '3.12'] 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install uv 25 | run: | 26 | curl -LsSf https://astral.sh/uv/install.sh | sh 27 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 28 | 29 | - name: Create and activate virtual environment 30 | run: | 31 | uv venv 32 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 33 | 34 | - name: Install dependencies 35 | run: | 36 | uv pip install 'setuptools[pkg_resources]' 37 | uv pip install -e ".[test]" 38 | 39 | - name: Run tests with coverage 40 | run: | 41 | uv pip install pytest-cov 42 | pytest -s tests/ --cov=numerblox --cov-report term-missing 43 | 44 | - name: Build wheel 45 | run: | 46 | uv pip install build 47 | python -m build --wheel 48 | 49 | - name: Install built wheel 50 | run: | 51 | uv pip install dist/*.whl 52 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Upload coverage to Codecov 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | coverage: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.12 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.12' 17 | - name: 
Install uv 18 | run: | 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 21 | - name: Create and activate virtual environment 22 | run: | 23 | uv venv 24 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 25 | - name: Install dependencies 26 | run: | 27 | uv pip install 'setuptools[pkg_resources]' 28 | uv pip install -e ".[test]" 29 | - name: Run tests with coverage 30 | run: | 31 | uv pip install pytest pytest-cov 32 | pytest -s tests/ --cov=numerblox --cov-report term-missing --cov-report=xml 33 | - name: Upload coverage to Codecov 34 | uses: codecov/codecov-action@v3 35 | env: 36 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 37 | with: 38 | files: ./coverage.xml 39 | fail_ci_if_error: false 40 | verbose: true 41 | -------------------------------------------------------------------------------- /.github/workflows/deploy-mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: MKDocs -> GitHub Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.12' 19 | 20 | - name: Install uv 21 | run: | 22 | curl -LsSf https://astral.sh/uv/install.sh | sh 23 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 24 | 25 | - name: Create and activate virtual environment 26 | run: | 27 | uv venv 28 | echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH 29 | 30 | - name: Install dependencies 31 | run: | 32 | uv pip install 'setuptools[pkg_resources]' 33 | uv pip install mkdocs mkdocs-material mkdocstrings mkdocstrings-python 34 | 35 | - name: Install project 36 | run: uv pip install -e . 37 | 38 | - name: Build the MkDocs site 39 | run: mkdocs build 40 | 41 | - name: Deploy to GitHub Pages 42 | uses: peaceiris/actions-gh-pages@v3 43 | with: 44 | github_token: ${{ secrets.GITHUB_TOKEN }} 45 | publish_dir: ./site 46 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [push, pull_request] 3 | jobs: 4 | ruff: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: astral-sh/ruff-action@v1.1.0 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .jekyll-cache/ 2 | Gemfile.lock 3 | *.bak 4 | .gitattributes 5 | .last_checked 6 | .gitconfig 7 | *.bak 8 | *.log 9 | *~ 10 | ~* 11 | _tmp* 12 | tmp* 13 | tags 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | env/ 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # dotenv 97 | .env 98 | 99 | # virtualenv 100 | .venv 101 | venv/ 102 | ENV/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | 117 | .vscode 118 | *.swp 119 | 120 | # osx generated files 121 | .DS_Store 122 | .DS_Store? 123 | .Trashes 124 | ehthumbs.db 125 | Thumbs.db 126 | .idea 127 | 128 | # pytest 129 | .pytest_cache 130 | 131 | # tools/trust-doc-nbs 132 | docs_src/.last_checked 133 | 134 | # link checker 135 | checklink/cookies.txt 136 | 137 | # Numerai authentication 138 | key.json 139 | keys.json 140 | finnhub_key.json 141 | eod_key.json 142 | 143 | _docs/ 144 | sidebar.yml 145 | 146 | # Test files 147 | edu_nbs/*.h5 148 | prod_requirements.txt 149 | test_numclassic_general_* 150 | test_numcrypto_general_* 151 | some_path_* 152 | test_kaggle_* 153 | test_eod_* 154 | 155 | examples/data 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # Full API Reference 2 | 3 | This section provides a detailed reference to all objects defined in NumerBlox. 
4 | 5 | ## Download 6 | 7 | ::: numerblox.download 8 | 9 | ------------------------------------------------ 10 | 11 | ## NumerFrame 12 | 13 | ::: numerblox.numerframe 14 | 15 | ------------------------------------------------ 16 | 17 | ## Preprocessing 18 | 19 | ### Base Preprocessing 20 | 21 | ::: numerblox.preprocessing.base 22 | 23 | ### Classic Preprocessing 24 | 25 | ::: numerblox.preprocessing.classic 26 | 27 | ### Signals Preprocessing 28 | 29 | ::: numerblox.preprocessing.signals 30 | 31 | ------------------------------------------------ 32 | 33 | ## Meta 34 | 35 | ::: numerblox.meta 36 | 37 | ------------------------------------------------ 38 | 39 | ## Ensemble 40 | 41 | ::: numerblox.ensemble 42 | 43 | ------------------------------------------------ 44 | 45 | ## Neutralizers 46 | 47 | ::: numerblox.neutralizers 48 | 49 | ------------------------------------------------ 50 | 51 | ## Penalizers 52 | 53 | ::: numerblox.penalizers 54 | 55 | ------------------------------------------------ 56 | 57 | ## Prediction Loaders 58 | 59 | ::: numerblox.prediction_loaders 60 | 61 | ------------------------------------------------ 62 | 63 | ## Targets 64 | 65 | ::: numerblox.targets 66 | 67 | ------------------------------------------------ 68 | 69 | ## Evaluation 70 | 71 | ::: numerblox.evaluation 72 | 73 | ------------------------------------------------ 74 | 75 | ## Submission 76 | 77 | ::: numerblox.submission 78 | 79 | ------------------------------------------------ 80 | 81 | ## Model Upload 82 | 83 | ::: numerblox.model_upload 84 | 85 | ------------------------------------------------ 86 | 87 | ## Models 88 | 89 | ::: numerblox.models 90 | 91 | ------------------------------------------------ 92 | 93 | ## Miscellaneous 94 | 95 | ::: numerblox.misc 96 | 97 | ------------------------------------------------ -------------------------------------------------------------------------------- /docs/assets/.icons/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/docs/assets/.icons/favicon.ico -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # How To Contribute 2 | 3 | First, thank you for your consideration to contribute to `numerblox`! This document provides some general guidelines to streamline the contribution process. 4 | 5 | ## Installation 6 | 7 | If you haven't installed `numerblox` yet, clone the project into your favorite development environment. Install the repository in editable mode with all dev dependencies. 8 | 9 | Using pip: 10 | ```bash 11 | git clone https://github.com/crowdcent/numerblox.git 12 | cd numerblox 13 | pip install -e ".[test]" 14 | ``` 15 | 16 | Using [uv](https://github.com/astral-sh/uv): 17 | ```bash 18 | git clone https://github.com/crowdcent/numerblox.git 19 | cd numerblox 20 | uv venv 21 | uv pip install -e ".[test]" 22 | ``` 23 | 24 | ## Developing considerations 25 | 26 | ### 1. Building a new component 27 | 28 | If you would like to build a new component for Numerblox, please consider the following steps: 29 | 30 | 1. Place the new component in the appropriate section. Is it a Downloader (`download.py`), a Preprocessor (`preprocessing.py`) or a Submitting tool (`submission.py`)? 
Also check the documentation on that section for templates, conventions and how these blocks are constructed in general. 31 | 2. Add tests for this new component in the appropriate test file. If you are introducing a new Downloader, add tests in `tests/test_downloader.py`. If you are introducing a new Preprocessor, add tests in `tests/test_preprocessing.py`. etc. 32 | 3. When making a preprocessor or postprocessor, make sure the component follows [scikit-learn conventions](https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator). The core things to implement are inheriting from `BaseEstimator` and implementing `fit`, `transform` and `get_feature_names_out` methods. 33 | 4. If your component introduces new dependencies, make sure to add them to uv with `uv add `. 34 | 5. Consider adding support for [metadata routing](https://scikit-learn.org/stable/metadata_routing.html) if your component uses additional arguments for `fit`, `transform` and/or `predict`. Check out the documentation and other Numerblox components that use this feature for examples. We are also happy to help out with implementation of metadata routing. 35 | 36 | 37 | ### 2. Fixing bugs 38 | 39 | Even though most of the components in this library are tested, users will still likely run into issues. If you discover bugs, other issues or ideas for enhancements, do not hesitate to make a [Github issue](https://github.com/crowdcent/numerblox/issues). Describe in the issue what code was run on what machine and background on the issue. Add stacktraces and screenshots if this is relevant for solving the issue. Also, please add appropriate labels for the Github issue. 40 | 41 | - Ensure the bug was not already reported by searching on GitHub under Issues. 42 | - If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. 43 | - Be sure to add the complete error messages. 44 | - Be sure to add tests that fail without your patch, and pass with it. 45 | 46 | ### 3. Creating an example notebook 47 | 48 | We welcome example notebooks that demonstrate the use of `numerblox`. If you want to create an example notebook, please make a notebook in the `examples/` folder. Make sure to add appropriate descriptions and explain the process of using the various components. Before committing please run the notebook from top to bottom. If it runs without errors, you can commit the notebook. 49 | Lastly, if the notebook uses additional libraries, please note this at the top of the notebook and create a code block with `!pip install `. 50 | 51 | Example pip install cell: 52 | 53 | ```bash 54 | !pip install scikit-lego plotly 55 | ``` 56 | 57 | #### Did you write a patch that fixes a bug? 58 | - Open a new GitHub pull request with the patch. 59 | - Ensure that your PR includes a test that fails without your patch, and pass with it. 60 | - Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 61 | 62 | ## PR submission guidelines 63 | - Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needing to keep each PR focused. 64 | - Do not turn an already submitted PR into your development playground. 
If after you submitted PR, you discovered that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project. 65 | - If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work and then submit it again. Use common sense where you'd choose one way over another. 66 | -------------------------------------------------------------------------------- /docs/crowdcent.md: -------------------------------------------------------------------------------- 1 | # About CrowdCent 2 | 3 | CrowdCent is on a mission to decentralize investment management by changing the way investment funds make decisions and allocate capital. We are the machine learning and coordination layer for online investment communities looking to turn their data into actionable, investable portfolios. 4 | 5 | More information about CrowdCent can be found on [crowdcent.com](https://crowdcent.com). -------------------------------------------------------------------------------- /docs/disclaimer.md: -------------------------------------------------------------------------------- 1 | # Disclaimer 2 | 3 | -------------------------------------- 4 | 5 | Under no circumstances should any information provided in this software — or on associated distribution outlets — be construed as an offer soliciting the purchase or sale of any security or interest in any pooled investment vehicle sponsored, discussed, or mentioned by CrowdCent LLC or affiliates. Nor should it be construed as an offer to provide investment advisory services; an offer to invest in a CrowdCent investment vehicle will be made separately and only by means of the confidential offering documents of the specific pooled investment vehicles — which should be read in their entirety, and only to those who, among other requirements, meet certain qualifications under federal securities laws. Such investors, defined as accredited investors and qualified purchasers, are generally deemed capable of evaluating the merits and risks of prospective investments and financial matters. There can be no assurances that CrowdCent’s investment objectives will be achieved or investment strategies will be successful. Any investment in a vehicle managed by CrowdCent involves a high degree of risk including the risk that the entire amount invested is lost. Any investments or portfolio companies mentioned, referred to, or described are not representative of all investments in vehicles managed by CrowdCent and there can be no assurance that the investments will be profitable or that other investments made in the future will have similar characteristics or results. 6 | 7 | -------------------------------------- 8 | -------------------------------------------------------------------------------- /docs/download.md: -------------------------------------------------------------------------------- 1 | # Downloaders 2 | 3 | ## Numerai Classic 4 | 5 | `NumeraiClassicDownloader` simplifies downloading of datasets from Numerai's API. It allows you to easily download data with a few lines and the data is automatically organized in directories. 
6 | 7 | More information: [https://numer.ai/data](https://numer.ai/data) 8 | 9 | 10 | ```py 11 | from numerblox.download import NumeraiClassicDownloader 12 | 13 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 14 | 15 | # Training and validation data 16 | dl.download_training_data("train_val", version="5.0") 17 | 18 | # Live data 19 | dl.download_live_data("live", version="5.0") 20 | ``` 21 | 22 | Besides these common use cases you can also get feature sets and meta model predictions with `NumeraiClassicDownloader`. 23 | 24 | ```py 25 | from numerblox.download import NumeraiClassicDownloader 26 | 27 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 28 | 29 | # Get feature sets 30 | features = dl.get_classic_features() 31 | 32 | # Get meta model predictions 33 | dl.download_meta_model_preds() 34 | meta_model_preds = pd.read_parquet("my_numerai_data_folder/meta_model.parquet") 35 | ``` 36 | 37 | ## Numerai Signals 38 | 39 | Numerai provides a dataset for Numerai Signals. This is a good starting point for new users. 40 | 41 | More information: [https://signals.numer.ai/data](https://signals.numer.ai/data) 42 | 43 | ```py 44 | from numerblox.download import NumeraiSignalsDownloader 45 | 46 | dl = NumeraiSignalsDownloader(directory_path="my_numerai_signals_folder") 47 | 48 | # Download full dataset 49 | dl.download_training_data() 50 | 51 | # Live data 52 | dl.download_live_data() 53 | ``` 54 | 55 | ## Numerai Crypto 56 | 57 | For Numerai Crypto there are files to download. 58 | 59 | More information: [https://crypto.numer.ai/data](https://crypto.numer.ai/data) 60 | 61 | ```py 62 | from numerblox.download import NumeraiCryptoDownloader 63 | 64 | dl = NumeraiCryptoDownloader(directory_path="my_numerai_crypto_folder") 65 | 66 | # Training targets 67 | dl.download_training_data() 68 | 69 | # Live Crypto universe data 70 | dl.download_live_data() 71 | ``` 72 | 73 | ### EOD Historical Data 74 | 75 | Download data from EOD historical data. A common data vendor used for Numerai Signals. 76 | 77 | More information: [https://eodhistoricaldata.com](https://eodhistoricaldata.com) 78 | 79 | 80 | Make sure you have the underlying Python package for EOD installed. 81 | 82 | ```bash 83 | pip install eod 84 | ``` 85 | 86 | For EOD you also need to define credentials in the form of an API key. 87 | 88 | More information: [https://eodhd.com/pricing](https://eodhd.com/pricing) 89 | 90 | ```py 91 | from numerblox.download import EODDownloader 92 | 93 | eod_api_key = "MY_EOD_API_KEY" 94 | tickers = ["AAPL.US", "MSFT.US", "GOOG.US"] 95 | dl = EODDownloader(directory_path="my_numerai_signals_folder", 96 | key=eod_api_key, tickers=tickers) 97 | 98 | # Download full dataset 99 | dl.download_training_data(start="2008-01-01") 100 | 101 | # load data directly into DataFrame from January 1st 2024 for live. 102 | live_data = dl.download_live_data(start="2024-01-01") 103 | ``` 104 | 105 | ### Kaggle 106 | 107 | Some Numerai dataset are uploaded and maintained on Kaggle Datasets. NumerBlox offers a convenient API to download these datasets. 108 | 109 | For authentication, make sure you have a directory called .kaggle in your home directory 110 | with therein a kaggle.json file. 
kaggle.json should have the following structure: 111 | `{"username": USERNAME, "key": KAGGLE_API_KEY}` 112 | 113 | More info on authentication: [github.com/Kaggle/kaggle-api#api-credentials](https://github.com/Kaggle/kaggle-api#api-credentials) 114 | 115 | More info on the Kaggle Python API: [kaggle.com/donkeys/kaggle-python-api](https://kaggle.com/donkeys/kaggle-python-api) 116 | 117 | Also make sure you have the `kaggle` Python package installed. 118 | 119 | ```bash 120 | pip install kaggle 121 | ``` 122 | 123 | Below is a quickstart example using Katsu's starter dataset. 124 | 125 | ```py 126 | from numerblox.download import KaggleDownloader 127 | 128 | kd = KaggleDownloader(directory_path="my_numerai_signals_folder") 129 | 130 | # A good example of Numerai Signals data on Kaggle Datasets is Katsu1110's yfinance price dataset. 131 | kd.download_live_data("code1110/yfinance-stock-price-data-for-numerai-signals") 132 | ``` 133 | 134 | ### Google Cloud Storage Integration 135 | 136 | All NumerBlox downloaders inherit from `BaseIO`, which provides built-in support for Google Cloud Storage (GCS). This allows you to easily upload and download data to/from GCS buckets. 137 | 138 | #### Prerequisites 139 | 140 | Make sure you have Google Cloud Storage credentials configured. You'll need: 141 | - The `google-cloud-storage` Python package installed 142 | - Authentication set up (typically via `GOOGLE_APPLICATION_CREDENTIALS` environment variable or default credentials) 143 | 144 | #### Usage 145 | 146 | ```py 147 | from numerblox.download import NumeraiClassicDownloader 148 | 149 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 150 | 151 | # Download from GCS 152 | dl.download_file_from_gcs(bucket_name="my-bucket", gcs_path="path/to/file.parquet") 153 | dl.download_directory_from_gcs(bucket_name="my-bucket", gcs_path="path/to/directory") 154 | 155 | # Upload to GCS 156 | dl.upload_file_to_gcs(bucket_name="my-bucket", gcs_path="path/to/file.parquet", local_path="local_file.parquet") 157 | dl.upload_directory_to_gcs(bucket_name="my-bucket", gcs_path="path/to/directory") 158 | ``` 159 | 160 | This functionality is available for all downloaders (NumeraiClassicDownloader, NumeraiSignalsDownloader, NumeraiCryptoDownloader, EODDownloader, and KaggleDownloader) since they all inherit from BaseIO. 161 | 162 | ### Rolling your own downloader 163 | 164 | We invite users to build out their own downloaders for Numerai Signals. The only requirements are that you inherit from `numerblox.download.BaseDownloader` and implement the `download_training_data` and `download_live_data` methods. Below you will find a template for this. 165 | 166 | If you have a downloader that you would like to share with the community, please open a Pull Request in NumerBlox. 167 | 168 | ```py 169 | class AwesomeCustomDownloader(BaseDownloader): 170 | """ 171 | TEMPLATE - 172 | Download awesome financial data for Numerai Signals from who knows where. 173 | 174 | :param directory_path: Base folder to download files to. 175 | """ 176 | def __init__(self, directory_path: str): 177 | super().__init__(directory_path=directory_path) 178 | 179 | def download_live_data(self, *args, **kwargs): 180 | """ (minimal) weekly live downloading here. """ 181 | ... 182 | 183 | def download_training_data(self, *args, **kwargs): 184 | """ Training + validation dataset downloading here. """ 185 | ... 
186 | 187 | ``` -------------------------------------------------------------------------------- /docs/end_to_end.md: -------------------------------------------------------------------------------- 1 | # End To End Examples 2 | 3 | This section will show NumerBlox in action for some more advanced use cases. If you are looking for inspiration to leverage the power of NumerBlox, check out these examples. 4 | 5 | First we download the classic data with NumeraiClassicDownloader. We use a NumerFrame for convenience to parse the dataset. 6 | 7 | ```py 8 | from numerblox.numerframe import create_numerframe 9 | from numerblox.download import NumeraiClassicDownloader 10 | dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder") 11 | dl.download_training_data("train_val", version="5.0", int8=True) 12 | df = create_numerframe("my_numerai_data_folder/train_val/train.parquet") 13 | val_df = create_numerframe("my_numerai_data_folder/train_val/val.parquet") 14 | 15 | X, y = df.get_feature_target_pair(multi_target=False) 16 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 17 | era_series = df.get_era_data 18 | val_X, val_y = val_df.get_feature_target_pair(multi_target=False) 19 | val_features = val_df.get_feature_data 20 | val_eras = val_df.get_era_data 21 | ``` 22 | 23 | ## 1. Neutralized XGBoost pipeline 24 | 25 | Let's construct an end-to-end pipeline that does the following: 26 | - Augment FNCv3 features with group statistics features for the `sunshine` and `rain` data. 27 | - Fit 5 folds of XGBoost. 28 | - Ensemble them with a weighted average where the more recent folds get a higher weight. 29 | - Neutralize the predictions with respect to the original features. 30 | 31 | The external libraries used are xgboost and sklego. Make sure to have these dependencies installed. 32 | 33 | ```bash 34 | !pip install xgboost sklego 35 | ``` 36 | 37 | ```py 38 | from xgboost import XGBRegressor 39 | from sklego.preprocessing import ColumnSelector 40 | from sklearn.model_selection import TimeSeriesSplit 41 | from sklearn.pipeline import make_union 42 | from sklearn.compose import make_column_transformer 43 | 44 | from numerblox.preprocessing import GroupStatsPreProcessor 45 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 46 | from numerblox.ensemble import NumeraiEnsemble 47 | from numerblox.neutralizers import FeatureNeutralizer 48 | 49 | # Preprocessing 50 | gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain']) 51 | fncv3_selector = ColumnSelector(fncv3_cols) 52 | 53 | preproc_pipe = make_union(gpp, fncv3_selector) 54 | 55 | # Model 56 | xgb = XGBRegressor() 57 | cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5)) 58 | ens = NumeraiEnsemble() 59 | fn = FeatureNeutralizer(proportion=0.5) 60 | full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn) 61 | 62 | # Train full model 63 | full_pipe.fit(X, y, era_series=era_series) 64 | 65 | # Inference on validation data 66 | val_preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 67 | ``` 68 | 
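To sanity-check the fitted pipeline, the validation predictions can be scored with the evaluator described in the evaluation docs. The snippet below is a minimal sketch: it assumes `val_preds` comes back as a single prediction column and that the era column in the v5.0 validation data is named `era`.

```py
import numpy as np
from numerblox.evaluation import NumeraiClassicEvaluator, FAST_METRICS

# Attach the pipeline predictions to the validation DataFrame and compute per-era metrics.
val_df["prediction"] = np.asarray(val_preds).reshape(-1)
evaluator = NumeraiClassicEvaluator(era_col="era", metrics_list=FAST_METRICS)
metrics = evaluator.full_evaluation(val_df, pred_cols=["prediction"], target_col="target")
```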
69 | ## 2. Multi Classification Ensemble 70 | 71 | This example shows a multiclass classification approach where the Numerai target is transformed into integers (`[0, 0.25, 0.5, 0.75, 1.0] -> [0, 1, 2, 3, 4]`) and treated as a classification problem. 72 | 73 | When we call `predict_proba` on a classifier, the result will be a probability for every class, for example `[0.1, 0.2, 0.3, 0.2, 0.2]`. In order to reduce these to one number we use the `PredictionReducer`, which takes the probabilities for every model and reduces them with a vector multiplication (for example, `[0.1, 0.2, 0.3, 0.2, 0.2] @ [0, 1, 2, 3, 4] = 2.2`). It does this for every model, so the output of `PredictionReducer` has 3 columns. 74 | 75 | Because we set `donate_weighted=True` in `NumeraiEnsemble`, the 3 columns are reduced to one column using a weighted ensemble where the most recent fold gets the highest weight. Lastly, the final prediction column is neutralized. 76 | 77 | ```py 78 | from sklearn.tree import DecisionTreeClassifier 79 | from sklearn.model_selection import TimeSeriesSplit 80 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 81 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 82 | from numerblox.neutralizers import FeatureNeutralizer 83 | 84 | model = DecisionTreeClassifier() 85 | crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba') 86 | pred_rud = PredictionReducer(n_models=3, n_classes=5) 87 | ens2 = NumeraiEnsemble(donate_weighted=True) 88 | neut2 = FeatureNeutralizer(proportion=0.5) 89 | full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2) 90 | 91 | full_pipe.fit(X, y, era_series=era_series) 92 | 93 | preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 94 | ``` 95 | 96 | ## 3. Ensemble of ensembles of regressors 97 | 98 | This example introduces a `ColumnTransformer` that contains 3 pipelines. Each pipeline can have a different set of arguments. Here we simplify by passing the same columns to every pipeline. 99 | The output from all pipelines is concatenated, ensembled with `NumeraiEnsemble`, and the final ensembled column is neutralized. Note that every fold here is equally weighted. If you want to give recent folds more weight, set `weights` in `NumeraiEnsemble` for all `ColumnTransformer` outputs. 100 | 101 | ```py 102 | from sklearn.tree import DecisionTreeRegressor 103 | from sklearn.model_selection import TimeSeriesSplit 104 | from sklearn.pipeline import make_pipeline 105 | from sklearn.compose import make_column_transformer 106 | from numerblox.meta import CrossValEstimator, make_meta_pipeline 107 | from numerblox.ensemble import NumeraiEnsemble 108 | from numerblox.neutralizers import FeatureNeutralizer 109 | 110 | 111 | pipes = [] 112 | for i in range(3): 113 | model = DecisionTreeRegressor() 114 | crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=5), predict_func='predict') 115 | pipe = make_pipeline(crossval) 116 | pipes.append(pipe) 117 | 118 | models = make_column_transformer(*[(pipe, X.columns.tolist()) for pipe in pipes]) 119 | ens_end = NumeraiEnsemble() 120 | neut = FeatureNeutralizer(proportion=0.5) 121 | full_pipe = make_meta_pipeline(models, ens_end, neut) 122 | 123 | full_pipe.fit(X, y, era_series=era_series) 124 | 125 | preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features) 126 | ``` 127 | -------------------------------------------------------------------------------- /docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluators 2 | 3 | NumerBlox offers evaluators for both Numerai Classic and Numerai Signals. 4 | 5 | ## Common Metrics 6 | 7 | For both `NumeraiClassicEvaluator` and `NumeraiSignalsEvaluator` you can set a custom `metrics_list` with all metrics you want to compute.
8 | 9 | By default, metrics will include `["mean_std_sharpe", "apy", "max_drawdown", "calmar_ratio"]` 10 | 11 | All valid metrics for `metrics_list` are: 12 | 13 | - "mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on Corrv2 (Numerai Correlation). 14 | 15 | - "apy" -> Annual Percentage Yield. 16 | 17 | - "max_drawdown" -> Max drawdown. 18 | 19 | - "calmar_ratio" -> [Calmar Ratio](https://www.investopedia.com/terms/c/calmarratio.asp). 20 | 21 | - "autocorrelation" -> Autocorrelation (1st order). 22 | 23 | - "max_feature_exposure" -> [Max feature exposure](https://forum.numer.ai/t/model-diagnostics-feature-exposure/899). 24 | 25 | - "smart_sharpe" -> Smart Sharpe. 26 | 27 | - "legacy_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on legacy model contribution. 28 | 29 | - "fn_mean_std_sharpe" -> [Feature Neutral](https://docs.numer.ai/tournament/feature-neutral-correlation) mean, standard deviation and Sharpe ratio (can take some time to compute). 30 | 31 | - "tb200_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on TB200. 32 | 33 | - "tb500_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on TB500. 34 | 35 | The following metrics only work if `benchmark_cols` are defined in `full_evaluation`: 36 | 37 | - "mc_mean_std_sharpe" -> Mean, standard deviation and Sharpe ratio based on [model contribution](https://forum.numer.ai/t/mmc-staking-starts-jan-2-2024/6827). 38 | 39 | - "corr_with" -> Correlation with benchmark predictions. 40 | 41 | - "ex_diss_pearson" (alias "ex_diss") -> [Exposure Dissimilarity](https://forum.numer.ai/t/true-contribution-details/5128/4) to benchmark predictions using Pearson correlation. 42 | 43 | - "ex_diss_spearman" -> [Exposure Dissimilarity](https://forum.numer.ai/t/true-contribution-details/5128/4) to benchmark predictions using Spearman correlation. Will be slower compared to "ex_diss_pearson". 44 | 45 | - "churn" -> [Churn](https://forum.numer.ai/t/better-lgbm-params-signals-v2-data-and-reducing-signals-churn/7638) is a statistic describing how the alpha scores of a signal changes over time. 46 | 47 | - "tb200_churn" -> Churn based on TB200. 48 | 49 | - "tb500_churn" -> Churn based on TB500. 50 | 51 | ## Numerai Classic specific metrics 52 | 53 | `NumeraiClassicEvaluator` can also compute [FNCv3](https://docs.numer.ai/numerai-tournament/scoring/feature-neutral-correlation#fnc-on-the-website). If you want to compute this add `fncv3_mean_std_sharpe` to the `metrics_list`. 54 | 55 | ```py 56 | from numerblox.evaluation import NumeraiClassicEvaluator, FAST_METRICS 57 | 58 | # Validation DataFrame to compute metrics on 59 | # Should have at least era_col, pred_cols and target_col columns. 60 | val_df = ... 61 | 62 | evaluator = NumeraiClassicEvaluator(era_col="era", metrics_list=FAST_METRICS) 63 | metrics = evaluator.full_evaluation(val_df, 64 | pred_cols=["prediction"], 65 | target_col="target", 66 | benchmark_cols=["benchmark1", "benchmark2"]) 67 | ``` 68 | 69 | ## Numerai Signals specific metrics 70 | 71 | `NumeraiSignalsEvaluator` offers [Numerai Signals diagnostics](https://forum.numer.ai/t/signals-diagnostics-guide/5950) scores. This is a special operation as it calls on Numerai servers and needs additional authentication, so it is not included in `full_evaluation`. 
72 | 73 | Example of how to get diagnostic scores for Numerai Signals: 74 | ```py 75 | import pandas as pd 76 | from numerblox.misc import Key 77 | from numerblox.evaluation import NumeraiSignalsEvaluator 78 | evaluator = NumeraiSignalsEvaluator() 79 | 80 | # A Numerai Signals model name you use. 81 | model_name = "MY_MODEL" 82 | # NumerBlox Key for accessing the Numerai API 83 | key = Key(pub_id="Hello", secret_key="World") 84 | # DataFrame with validation data containing prediction, date, ticker and data_type columns 85 | val_df = pd.DataFrame() 86 | 87 | evaluator.get_neutralized_corr(val_df, model_name=model_name, key=key, corr_col="validationRic") 88 | # Returns a Pandas DataFrame with validationRic. 89 | ``` 90 | 91 | ## Custom functions 92 | 93 | In addition to the default metrics, evaluators can be augmented with custom metrics. This can be done by defining a dictionary of functions and arguments. 94 | 95 | The custom function dictionary should have the following structure: 96 | ```py 97 | { 98 | "func1": # Metric name 99 | { 100 | "func": custom_function, # Function to call 101 | "args": { # General arguments (can be any type) 102 | "dataf": "dataf", 103 | "some_arg": "some_arg", 104 | }, 105 | "local_args": ["dataf"] # List of local variables to use/resolve 106 | }, 107 | "func2": 108 | { 109 | "func": custom_function2, 110 | "args": { 111 | "dataf": "dataf", 112 | "some_arg": "some_arg", 113 | }, 114 | "local_args": ["dataf"] 115 | }, 116 | (...) 117 | } 118 | ``` 119 | 120 | - The main keys (`func1` and `func2` in the example) will be the metric key names for the output evaluation DataFrame. 121 | 122 | - The `func` key should be a function that takes in the arguments defined in `args` as keyword arguments. `func` should be a callable function or class (i.e. a class that implements `__call__`). 123 | 124 | - The `args` key should be a dictionary with arguments to pass to `func`. The values of the dictionary can be any type. Arguments that you want resolved as local variables should be defined as strings (see `local_args` explanation). 125 | 126 | - The `local_args` key should be a list of strings that refer to variables that exist locally in the [evaluation_one_col](https://crowdcent.github.io/numerblox/api/#numerblox.evaluation.BaseEvaluator.evaluation_one_col) function. These local variables will be resolved to local variables for `func`. This allows you to use [evaluation_one_col](https://crowdcent.github.io/numerblox/api/#numerblox.evaluation.BaseEvaluator.evaluation_one_col) variables like `dataf`, `pred_col`, `target_col`, `col_stats`, `mean`, `per_era_numerai_corrs`, etc. 127 | 128 | 129 | Example of how to use custom functions in `NumeraiClassicEvaluator`: 130 | ```py 131 | import numpy as np 132 | from numerblox.evaluation import NumeraiClassicEvaluator 133 | def residuals(dataf, target_col, pred_col, val: int): 134 | """ Simple dummy func: mean of residuals. """ 135 | return np.mean(dataf[target_col] - dataf[pred_col] + val) 136 | 137 | custom_functions = { 138 | "residuals": { 139 | # Callable function 140 | "func": residuals, 141 | "args": { 142 | # String referring to local variables 143 | "dataf": "dataf", 144 | "pred_col": "pred_col", 145 | "target_col": "target_col", 146 | # Static argument 147 | "val": 0.0001, 148 | }, 149 | # List of local variables to use/resolve 150 | "local_args": ["dataf", "pred_col", "target_col"] 151 | }, 152 | } 153 | 154 | evaluator = NumeraiClassicEvaluator(custom_functions=custom_functions) 155 | 156 | # Inside the evaluator, residuals(dataf=dataf, pred_col="prediction", target_col="target", val=0.0001) is called. 157 | metrics = evaluator.full_evaluation(val_df, 158 | pred_cols=["prediction"], 159 | target_col="target") 160 | # metrics will contain a "residuals" column. 161 | ``` -------------------------------------------------------------------------------- /docs/meta.md: -------------------------------------------------------------------------------- 1 | # Meta Estimators 2 | 3 | Meta estimators wrap existing scikit-learn estimators to provide additional functionality. Currently, the following meta estimators are available: 4 | 5 | - [CrossValEstimator](#crossvalestimator) 6 | - [MetaPipeline](#metapipeline) 7 | 8 | ## CrossValEstimator 9 | 10 | `CrossValEstimator` provides a way to integrate cross-validation directly into model training, enabling simultaneous fitting of multiple models across data folds. By doing this, you can fit it as one transformer and get outputs for each fold during the prediction phase. 11 | 12 | ### Why CrossValEstimator? 13 | 14 | - **Holistic Training**: Cross-validation offers a more robust model training process by leveraging multiple subsets of your data. This way, your model's performance is less susceptible to the peculiarities of any single data split. 15 | 16 | - **Inherent Ensemble**: By training on multiple folds, you're essentially building an ensemble of models. Ensembles often outperform individual models since they average out biases, reduce variance, and are less likely to overfit. 17 | 18 | - **Custom Evaluation**: With the `evaluation_func` parameter, you can input your custom evaluation logic, allowing for flexible and tailored performance assessment for each fold. 19 | 20 | - **Flexibility with Predictions**: Choose between different prediction functions like 'predict', 'predict_proba', and 'predict_log_proba' using the `predict_func` parameter. 21 | 22 | - **Verbose Logging**: Gain insights into the training process with detailed logs during the fitting phase, aiding in debugging and understanding model performance across folds.
23 | 24 | ### Example 25 | 26 | ```py 27 | from sklearn.model_selection import KFold 28 | from xgboost import XGBRegressor 29 | 30 | from numerblox.meta import CrossValEstimator 31 | 32 | # Define the cross-validation strategy 33 | cv = KFold(n_splits=5) 34 | 35 | # Initialize the estimator 36 | estimator = XGBRegressor(n_estimators=100, max_depth=3) 37 | 38 | # (optional) Define a custom evaluation function 39 | def custom_eval(y_true, y_pred): 40 | return {"mse": ((y_true - y_pred) ** 2).mean()} 41 | 42 | # Initialize the CrossValEstimator 43 | cross_val_estimator = CrossValEstimator(cv=cv, 44 | estimator=estimator, 45 | evaluation_func=custom_eval) 46 | 47 | # Fit the CrossValEstimator 48 | cross_val_estimator.fit(X_train, y_train) 49 | predictions = cross_val_estimator.predict(X_test) 50 | ``` 51 | 52 | ## MetaPipeline 53 | 54 | The `MetaPipeline` extends the functionality of scikit-learn's `Pipeline` by seamlessly integrating models and post-model transformations. It empowers you to employ sophisticated data transformation techniques not just before, but also after your model's predictions. This is particularly useful when post-processing predictions, such as neutralizing feature exposures in financial models. 55 | 56 | ## Why MetaPipeline? 57 | 58 | - **Post-Model Transformations**: It can be crucial to apply transformations, like feature neutralization, after obtaining predictions. `MetaPipeline` facilitates such operations, leading to improved model generalization and stability. 59 | 60 | - **Streamlined Workflow**: Instead of managing separate sequences for transformations and predictions, you can orchestrate them under a single umbrella, simplifying both development and production workflows. 61 | 62 | - **Flexible Integration**: `MetaPipeline` gracefully handles a variety of objects, including `Pipeline`, `FeatureUnion`, and `ColumnTransformer`. This makes it a versatile tool adaptable to diverse tasks and data structures. 63 | 64 | #### Example 65 | 66 | Consider a scenario where you have an `XGBRegressor` model and want to apply a `FeatureNeutralizer` after obtaining the model's predictions: 67 | 68 | ```py 69 | from xgboost import XGBRegressor 70 | from numerblox.meta import MetaPipeline 71 | from numerblox.neutralizers import FeatureNeutralizer 72 | 73 | # Define MetaPipeline steps 74 | steps = [ 75 | ('xgb_regressor', XGBRegressor(n_estimators=100, max_depth=3)), 76 | ('feature_neutralizer', FeatureNeutralizer(proportion=0.5)) 77 | ] 78 | 79 | # Create MetaPipeline 80 | meta_pipeline = MetaPipeline(steps) 81 | 82 | # Train and predict using MetaPipeline 83 | meta_pipeline.fit(X_train, y_train) 84 | predictions = meta_pipeline.predict(X_test) 85 | ``` 86 | 87 | For a more succinct creation of a `MetaPipeline`, you can use the `make_meta_pipeline` function: 88 | 89 | ```py 90 | from numerblox.meta import make_meta_pipeline 91 | 92 | pipeline = make_meta_pipeline(XGBRegressor(n_estimators=100, max_depth=3), 93 | FeatureNeutralizer(proportion=0.5)) 94 | ``` -------------------------------------------------------------------------------- /docs/model_upload.md: -------------------------------------------------------------------------------- 1 | # Numerai Model Upload 2 | 3 | The `NumeraiModelUpload` class is designed for uploading trained models to Numerai for automated submissions. You can upload a single trained model or a complete `sklearn` pipeline, allowing seamless integration with various machine learning workflows. 
This class efficiently handles model serialization, validation, and uploading, making it adaptable for different types of models and workflows. 4 | 5 | ***Warning**: The `NumeraiModelUpload` class is designed to work with very specific requirements. For compatibility, make sure your environment matches the requirements listed in the official numerai-predict repository: [numerai-predict/requirements.txt](https://github.com/numerai/numerai-predict/blob/master/requirements.txt). Using different versions or additional packages may lead to issues during model upload and execution.* 6 | 7 | ## Why Use NumeraiModelUpload? 8 | 9 | - **Automation**: Automates the model submission process to Numerai, reducing the need for manual intervention. 10 | - **Support for Sklearn Pipelines**: Integrates seamlessly with `sklearn` pipelines and NumerBlox processors, allowing users to submit models with preprocessing, feature engineering, and stacking in a single workflow. 11 | - **Error Handling**: Offers robust error handling with retry logic, ensuring reliable uploads even in case of network or API errors. 12 | - **Custom Predict Function**: Supports custom prediction functions for advanced use cases, offering greater flexibility. 13 | 14 | ## Instantiation 15 | 16 | To use `NumeraiModelUpload`, instantiate it with a `Key` object containing your credentials and optional parameters for error handling. 17 | 18 | ```python 19 | from numerblox.misc import Key 20 | from numerblox.submission import NumeraiModelUpload 21 | 22 | key = Key(pub_id="your_public_id", secret_key="your_secret_key") 23 | 24 | uploader = NumeraiModelUpload( 25 | key=key, 26 | max_retries=3, 27 | sleep_time=15, 28 | fail_silently=True 29 | ) 30 | ``` 31 | 32 | ### Parameters: 33 | 34 | - **`key`**: (Key) Key object containing valid credentials for Numerai Classic. 35 | - **`max_retries`**: (int, optional) Maximum number of retries for uploading models to Numerai. Defaults to 2. 36 | - **`sleep_time`**: (int, optional) Time in seconds to wait between retries. Defaults to 10. 37 | - **`fail_silently`**: (bool, optional) Whether to suppress errors and skip failed uploads without raising exceptions. Useful for batch processing. Defaults to `False`. 38 | - **`*args, **kwargs`**: Additional arguments passed to `NumerAPI` initialization. 39 | 40 | ## Model Uploading 41 | 42 | The primary method for uploading models is `create_and_upload_model`, which serializes the model using `cloudpickle`, saves it to a file, and uploads it to Numerai. 43 | 44 | ### Example: Upload a Single Model 45 | 46 | ```python 47 | import pandas as pd 48 | from some_ml_library import TrainedModel 49 | 50 | # Assume you have a trained model named 'my_model' 51 | my_model = TrainedModel() 52 | 53 | uploader.create_and_upload_model( 54 | model=my_model, 55 | model_name="my_model_name", 56 | file_path="models/my_model.pkl" 57 | ) 58 | ``` 59 | 60 | ### Method: `create_and_upload_model` 61 | 62 | Creates a model prediction function, serializes it, and uploads the model to Numerai. 63 | 64 | #### Parameters: 65 | 66 | - **`model`**: (Any) The machine learning model object. 67 | - **`feature_cols`**: (Optional[List[str]]) List of feature column names for predictions. If `None`, all columns starting with "feature_" will be used. 68 | - **`model_name`**: (str) Numerai model name. 69 | - **`file_path`**: (str) Full path where the serialized model function will be saved. 70 | - **`data_version`**: (Optional[str]) Data version to use for model upload. 
71 | - **`docker_image`**: (Optional[str]) Docker image to use for model upload. 72 | - **`custom_predict_func`**: (Optional[Callable[[pd.DataFrame], pd.DataFrame]]) Custom predict function. If provided, it should accept a DataFrame and return a DataFrame with a "prediction" column. 73 | 74 | #### Returns: 75 | 76 | - **`upload_id`**: Upload ID if successful, `None` otherwise. 77 | 78 | ### Method: `get_available_data_versions` 79 | 80 | Retrieve available data versions for model uploads. 81 | 82 | #### Example 83 | 84 | ```python 85 | available_data_versions = uploader.get_available_data_versions() 86 | print(available_data_versions) 87 | ``` 88 | 89 | ### Method: `get_available_docker_images` 90 | 91 | Retrieve available Docker images for model uploads. 92 | 93 | #### Example 94 | 95 | ```python 96 | available_docker_images = uploader.get_available_docker_images() 97 | print(available_docker_images) 98 | ``` 99 | 100 | ### Method: `_get_model_id` 101 | 102 | Private method to get the model ID needed for model uploading. 103 | 104 | #### Parameters: 105 | 106 | - **`model_name`**: (str) The name of the model registered in Numerai. 107 | 108 | #### Returns: 109 | 110 | - **`model_id`**: (str) Corresponding model ID for the given model name. 111 | 112 | ### Method: `get_model_mapping` 113 | 114 | Property that returns a mapping between raw model names and their corresponding model IDs. 115 | 116 | #### Example 117 | 118 | ```python 119 | model_mapping = uploader.get_model_mapping 120 | print(model_mapping) 121 | ``` 122 | 123 | ## Example: Upload an Ensemble Model with Sklearn Pipeline 124 | 125 | To upload an ensemble model with multiple layers using an `sklearn` pipeline: 126 | 127 | ```python 128 | from sklearn.ensemble import StackingRegressor 129 | from sklearn.linear_model import RidgeCV 130 | from sklearn.ensemble import RandomForestRegressor 131 | 132 | # Create base models 133 | base_models = [ 134 | ('rf', RandomForestRegressor()), 135 | ('ridge', RidgeCV()) 136 | ] 137 | 138 | # Create stacking ensemble model 139 | stacking_model = StackingRegressor(estimators=base_models, final_estimator=RandomForestRegressor()) 140 | 141 | uploader.create_and_upload_model( 142 | model=stacking_model, 143 | model_name="ensemble_model_name", 144 | file_path="models/ensemble_model.pkl" 145 | ) 146 | ``` 147 | 148 | ## Note 149 | 150 | Ensure that the credentials and model names used in the above examples match those configured in your Numerai account. 151 | -------------------------------------------------------------------------------- /docs/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | ## EraBoostedXGBRegressor 4 | 5 | NOTE: This is still an experimental feature and subject to change. 6 | 7 | `EraBoostedXGBRegressor` is a custom regressor extending the functionality of XGBoost, aimed at improving accuracy on specific eras in a dataset. It upweights the eras that are toughest to fit. It is designed to integrate seamlessly with scikit-learn. 8 | 9 | ### Why? 10 | - Era-Specific Focus: Targets the worst-performing eras in your data for performance enhancement, ensuring that the model improves where it is most needed. 11 | - Scikit-learn integration: `EraBoostedXGBRegressor` is designed to integrate seamlessly with scikit-learn. 12 | - Customization Options: Offers flexibility to adjust the proportion of eras to focus on, the number of trees added per iteration, and the total number of iterations for era boosting. 
13 | 14 | ### Quickstart 15 | 16 | Make sure to include the era column as a `pd.Series` in the `fit` method. 17 | ```python 18 | from numerblox.models import EraBoostedXGBRegressor 19 | 20 | model = EraBoostedXGBRegressor(proportion=0.5, trees_per_step=10, num_iters=20) 21 | model.fit(X=X_train, y=y_train, era_series=eras_train) 22 | 23 | predictions = model.predict(X_live) 24 | ``` -------------------------------------------------------------------------------- /docs/numerframe.md: -------------------------------------------------------------------------------- 1 | # NumerFrame 2 | 3 | `NumerFrame` is an extension of `pd.DataFrame` tailored specifically for the data format and workflow commonly used by Numerai participants. It builds upon the base functionalities of a Pandas DataFrame by offering utilities that simplify working with Numerai datasets. 4 | 5 | ## Why? 6 | - **Intuitive Data Handling**: With built-in features like `get_feature_data`, `get_target_data`, and more, it simplifies extracting data subsets specific to Numerai competitions. 7 | 8 | - **Automated Column Grouping**: Automatically parses columns into recognizable groups such as features, targets, predictions, making data retrieval more intuitive and less error-prone. 9 | 10 | - **Support for Multiple Formats**: Through `create_numerframe`, it supports initializing from various data formats such as CSV, Parquet, Excel, and Pickle, providing a flexible interface for users. 11 | 12 | - **Optimized for Numerai**: Whether you're trying to fetch specific eras, feature groups or patterns like all 20-day targets, `NumerFrame` is designed to simplify those tasks for Numerai participants. 13 | 14 | - **Chainable Operations**: Since most operations return another `NumerFrame`, they can be conveniently chained for more complex workflows. 15 | 16 | - **Tailored for Machine Learning**: With methods like `get_feature_target_pair`, it aids in easily splitting the data for machine learning tasks specific to the Numerai competition. 17 | 18 | By using `NumerFrame`, participants can focus more on model development and less on data wrangling, leading to a smoother and more efficient workflow in the Numerai competition. 19 | 20 | 21 | ## Initialization 22 | A NumerFrame can be initialized either from an existing `pd.DataFrame` or with `create_numerframe`. The `create_numerframe` function takes a path to a file and returns a `NumerFrame` object. This function automatically parses the file and supports CSV, Parquet, Excel and Pickle formats. 23 | 24 | `NumerFrame` automatically parses columns into groups so you can easily retrieve what you need. It automatically is aware of the `era` column for its operations. 25 | 26 | `NumerFrame` follows a convention for feature groups. 27 | 28 | - Features are all columns that start with `feature`. 29 | 30 | - Targets are all columns that start with `target`. 31 | 32 | - Predictions are all columns that start with `prediction`. 33 | 34 | - Aux columns are all that fall in none of these buckets, like `era`, `data_type` and `id`. 35 | 36 | - Era column is either `era` or `date`. 
37 | 38 | ```py 39 | import pandas as pd 40 | from numerblox.numerframe import NumerFrame, create_numerframe 41 | # From DataFrame 42 | data = pd.read_parquet('train.parquet') 43 | df = NumerFrame(data) 44 | 45 | # With create_numerframe 46 | df = create_numerframe('train.parquet') 47 | ``` 48 | 49 | 50 | ## Examples 51 | 52 | Basic functionality: 53 | ```py 54 | # Get data for features, targets, predictions, and aux 55 | features = df.get_feature_data 56 | targets = df.get_target_data 57 | predictions = df.get_prediction_data 58 | aux_data = df.get_aux_data 59 | ``` 60 | 61 | Additionally it is possible to get groups specific to Numerai Classic like FNCv3 and internal feature groups. The examples below show some advanced functionality in `NumerFrame`. 62 | 63 | ```py 64 | # Get data for features, targets and predictions 65 | features = df.get_feature_data 66 | targets = df.get_target_data 67 | predictions = df.get_prediction_data 68 | 69 | # Get specific data groups 70 | fncv3_features = df.get_fncv3_feature_data 71 | group_features = df.get_group_features(group='rain') 72 | small_features = df.get_small_feature_data 73 | medium_features = df.get_medium_feature_data 74 | 75 | # Fetch columns by pattern. For example all 20 day targets. 76 | pattern_data = df.get_pattern_data(pattern='_20') 77 | # Or for example Jerome targets. 78 | jerome_targets = df.get_pattern_data(pattern='_jerome_') 79 | 80 | # Split into feature and target pairs. Will get single target by default. 81 | X, y = df.get_feature_target_pair() 82 | # Optionally get all targets 83 | X, y = df.get_feature_target_pair(multi_target=True) 84 | 85 | # Fetch data for specified eras 86 | X, y = df.get_era_batch(eras=['0001', '0002']) 87 | 88 | # Since every operation returns a NumerFrame they can be chained. 89 | # An example chained operation is getting features and targets for the last 2 eras. 90 | X, y = df.get_last_eras(2).get_feature_target_pair() 91 | ``` 92 | 93 | -------------------------------------------------------------------------------- /docs/postprocessing.md: -------------------------------------------------------------------------------- 1 | # Postprocessing 2 | 3 | ## Feature Neutralization 4 | 5 | `FeatureNeutralizer` provides classic feature neutralization by subtracting linear model influence, ensuring that predictions are not overly influenced by a specific set of features. 6 | 7 | ### Why? 8 | - **Reduce Overfitting**: By neutralizing predictions, you can potentially reduce the risk of overfitting to specific feature characteristics. 9 | - **Control Feature Influence**: Allows you to have a granular control on how much influence a set of features can exert on the final predictions. 10 | - **Enhance Model Robustness**: By limiting the influence of potentially noisy or unstable features, you might improve the robustness of your model's predictions across different data periods. 11 | 12 | ### Quickstart 13 | 14 | Make sure to pass both the features to use for penalization as a `pd.DataFrame` and the accompanying era column as a `pd.Series` to the `predict` method. 15 | 16 | Additionally, `pred_name` and `proportion` can be lists. In this case, the neutralization will be performed for each prediction name and proportion. For example, if `pred_name=["prediction1", "prediction2"]` and `proportion=[0.5, 0.7]`, then the result will be an array with 4 neutralized prediction columns. 17 | All neutralizations will be performed in parallel. 
18 | 19 | Single column neutralization: 20 | ```python 21 | import pandas as pd 22 | from numerblox.neutralizers import FeatureNeutralizer 23 | 24 | predictions = pd.Series([0.24, 0.87, 0.6]) 25 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 26 | era_data = pd.Series([1, 1, 2]) 27 | 28 | neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5) 29 | neutralizer.fit() 30 | neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data) 31 | ``` 32 | 33 | Multiple column neutralization: 34 | ```python 35 | import pandas as pd 36 | from numerblox.neutralizers import FeatureNeutralizer 37 | 38 | predictions = pd.DataFrame({"prediction1": [0.24, 0.87, 0.6], "prediction2": [0.24, 0.87, 0.6]}) 39 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 40 | era_data = pd.Series([1, 1, 2]) 41 | 42 | neutralizer = FeatureNeutralizer(pred_name=["prediction1", "prediction2"], proportion=[0.5, 0.7]) 43 | neutralizer.fit() 44 | neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data) 45 | ``` 46 | 47 | ## FeaturePenalizer 48 | 49 | `FeaturePenalizer` neutralizes predictions using TensorFlow based on provided feature exposures. It's designed to integrate seamlessly with scikit-learn. 50 | 51 | ### Why? 52 | - **Limit Feature Exposure**: Ensures that predictions are not excessively influenced by any individual feature, which can help in achieving more stable predictions. 53 | - **Enhanced Prediction Stability**: Penalizing high feature exposures can lead to more stable and consistent predictions across different eras or data splits. 54 | - **Mitigate Model Biases**: If a model relies too heavily on a particular feature, penalization can help balance out the biases and make the model more generalizable. 55 | 56 | ### Quickstart 57 | 58 | Make sure to pass both the features to use for penalization as a `pd.DataFrame` and the accompanying era column as a `pd.Series` to the `predict` method. 59 | ```python 60 | import pandas as pd 61 | from numerblox.penalizers import FeaturePenalizer 62 | predictions = pd.Series([0.24, 0.87, 0.6]) 63 | feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) 64 | era_data = pd.Series([1, 1, 2]) 65 | 66 | penalizer = FeaturePenalizer(max_exposure=0.1, pred_name="prediction") 67 | penalizer.fit(X=predictions) 68 | penalized_predictions = penalizer.predict(X=predictions, features=feature_data, era_series=era_data) 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/prediction_loaders.md: -------------------------------------------------------------------------------- 1 | # Prediction Loaders 2 | 3 | Prediction loaders are designed to seamlessly fetch and transform prediction data, especially from Numerai's API. These classes can be integrated into pipelines to automate the prediction generation process for the Numerai competition. 4 | 5 | ## Why? 6 | 7 | Numerai provides example predictions to help participants understand the expected structure and format of predictions. With the `ExamplePredictions` class, you can easily fetch these example predictions for different data versions, allowing you to quickly evaluate or test your models against Numerai's standard prediction dataset. 8 | 9 | ## ExamplePredictions 10 | 11 | ### Usage: 12 | The `ExamplePredictions` class fetches the example predictions for the specified version of the Numerai dataset.
This can be useful for testing or understanding the prediction structure and data distribution. 13 | 14 | Downloaded files are automatically cleaned up after the data is loaded with the `transform` method. To keep the files, make sure to set `keep_files=True` when instantiating the class. 15 | 16 | ```py 17 | from numerblox.prediction_loaders import ExamplePredictions 18 | # Instantiate and load example predictions for v5.0 19 | example_loader = ExamplePredictions(file_name="v5.0/live_example_preds.parquet", keep_files=False) 20 | example_preds_df = example_loader.transform() 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/preprocessing.md: -------------------------------------------------------------------------------- 1 | # Preprocessors 2 | 3 | NumerBlox offers a suite of preprocessors to easily perform Numerai-specific data transformations. All preprocessors are compatible with `scikit-learn` pipelines and feature a similar API. Note that some preprocessors may require an additional `eras` or `tickers` argument in the `transform` step. 4 | 5 | ## Numerai Classic 6 | 7 | ### GroupStatsPreProcessor 8 | 9 | The v4.2 (rain) dataset for Numerai Classic reintroduced feature groups. The `GroupStatsPreProcessor` calculates group statistics for all data groups. It uses predefined feature group mappings to generate statistical measures (mean, standard deviation, skew) for each of the feature groups. 10 | 11 | #### Example 12 | 13 | Here's how you can use the `GroupStatsPreProcessor`: 14 | 15 | ```python 16 | from numerblox.preprocessing import GroupStatsPreProcessor 17 | group_processor = GroupStatsPreProcessor(groups=['intelligence']) 18 | 19 | # Return features with group statistics for the 'intelligence' group 20 | features = group_processor.transform(X) 21 | ``` 22 | 23 | ## Numerai Signals 24 | 25 | ### ReduceMemoryProcessor 26 | 27 | The `ReduceMemoryProcessor` reduces the memory usage of the data as much as possible. It's particularly useful for the Numerai Signals dataset, which can be quite large. 28 | 29 | Note that modern Numerai Classic data (v4.2+) is already in int8 format, so this processor will not be useful for Numerai Classic. 30 | 31 | ```py 32 | from numerblox.preprocessing import ReduceMemoryProcessor 33 | 34 | processor = ReduceMemoryProcessor(deep_mem_inspect=True, verbose=True) 35 | reduced_data = processor.fit_transform(dataf) 36 | ``` 37 | 38 | ### KatsuFeatureGenerator 39 | 40 | `KatsuFeatureGenerator` performs feature engineering based on [Katsu's starter notebook](https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners). This is useful for those participating in the Numerai Signals contest. 41 | 42 | You can specify custom windows that indicate how many days to look back when generating features. 43 | 44 | ```py 45 | from numerblox.preprocessing import KatsuFeatureGenerator 46 | 47 | feature_gen = KatsuFeatureGenerator(windows=[7, 14, 21]) 48 | enhanced_data = feature_gen.fit_transform(dataf) 49 | ``` 50 | 51 | ### EraQuantileProcessor 52 | 53 | `EraQuantileProcessor` transforms features into quantiles by era. This can help normalize data and make patterns more distinguishable. Quantiling operations are parallelized across features for faster processing. 54 | 55 | Using `.transform` requires passing `era_series`. This is because the quantiles are calculated per era, so the processor needs that information along with the raw input features.
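To build intuition for what this does, quantiling by era is roughly equivalent to ranking each feature within its era and snapping the ranks into a fixed number of equal-sized bins. The following is a simplified pandas sketch for a single feature; it is not the parallelized library implementation, and the exact binning/scaling inside `EraQuantileProcessor` may differ.

```py
import numpy as np
import pandas as pd

def era_quantiles(feature: pd.Series, era_series: pd.Series, num_quantiles: int = 50) -> pd.Series:
    def _quantile_one_era(x: pd.Series) -> pd.Series:
        # Percentile rank within the era, then snap to `num_quantiles` discrete bins in [0, 1].
        pct_rank = x.rank(method="first", pct=True)
        bins = np.minimum(np.floor(pct_rank * num_quantiles), num_quantiles - 1)
        return bins / (num_quantiles - 1)

    return feature.groupby(era_series).transform(_quantile_one_era)
```

In practice you would simply use the processor as shown below.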
56 | 57 | ```py 58 | from numerblox.preprocessing import EraQuantileProcessor 59 | 60 | eq_processor = EraQuantileProcessor(num_quantiles=50, random_state=42) 61 | transformed_data = eq_processor.fit_transform(X, era_series=eras_series) 62 | ``` 63 | 64 | ### LagPreProcessor 65 | 66 | `LagPreProcessor` generates lag features based on specified windows. Lag features can capture temporal patterns in time-series data. 67 | 68 | Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step. 69 | 70 | ```py 71 | from numerblox.preprocessing import LagPreProcessor 72 | 73 | lag_processor = LagPreProcessor(windows=[5, 10, 20]) 74 | lag_processor.fit(X) 75 | lagged_data = lag_processor.transform(X, ticker_series=tickers_series) 76 | 77 | ``` 78 | 79 | ### DifferencePreProcessor 80 | 81 | `DifferencePreProcessor` computes the difference between features and their lags. It's used after `LagPreProcessor`. 82 | 83 | WARNING: `DifferencePreProcessor` works only on `pd.DataFrame` input and with columns that are generated by `LagPreProcessor`. If you are using these in a Pipeline, make sure `LagPreProcessor` is defined before `DifferencePreProcessor` and that the output API is set to pandas (`pipeline.set_output(transform="pandas")`). 84 | 85 | Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step, so a pipeline with both preprocessors will need a `ticker_series` argument in `.transform`. 86 | 87 | ```py 88 | from sklearn.pipeline import make_pipeline 89 | from numerblox.preprocessing import DifferencePreProcessor, LagPreProcessor 90 | 91 | lag = LagPreProcessor(windows=[5, 10]) 92 | diff = DifferencePreProcessor(windows=[5, 10], pct_diff=True) 93 | pipe = make_pipeline(lag, diff) 94 | pipe.set_output(transform="pandas") 95 | pipe.fit(X) 96 | diff_data = pipe.transform(X, ticker_series=tickers_series) 97 | ``` 98 | 99 | ### PandasTaFeatureGenerator 100 | 101 | `PandasTaFeatureGenerator` uses the `pandas-ta-classic` library to generate technical analysis features. It's a powerful tool for those interested in financial time-series data. 102 | 103 | Make sure you have `pandas-ta-classic` installed before using this feature generator: 104 | 105 | ```bash 106 | pip install pandas-ta-classic 107 | ``` 108 | 109 | Currently `PandasTaFeatureGenerator` only works on `pd.DataFrame` input. Its input is a DataFrame with columns `[ticker, date, open, high, low, close, volume]`. 110 | 111 | ```py 112 | from numerblox.preprocessing import PandasTaFeatureGenerator 113 | 114 | ta_gen = PandasTaFeatureGenerator() 115 | ta_features = ta_gen.transform(dataf) 116 | ``` 117 | 118 | ### MinimumDataFilter 119 | 120 | `MinimumDataFilter` filters out dates and tickers that don't have enough data. For example, it makes sense to filter out tickers for which you have fewer than 100 days of data. Also, dates that have fewer than 100 unique tickers can be filtered out. 121 | 122 | Additionally, you can specify a list of tickers to blacklist and exclude from your data. 123 | 124 | NOTE: This step only works with DataFrame input. 125 | 126 | ```py 127 | from numerblox.preprocessing import MinimumDataFilter 128 | 129 | min_data_filter = MinimumDataFilter(min_samples_date=200, min_samples_ticker=1200, blacklist_tickers=["SOMETICKER.BLA"]) 130 | filtered_data = min_data_filter.fit_transform(dataf) 131 | ``` 132 | 133 | ## Rolling your own preprocessor 134 | 135 | We invite the community to contribute their own preprocessors to NumerBlox.
If you have a preprocessor that you think would be useful to others, please open a PR with your code and tests. 136 | The new preprocessor should adhere to [scikit-learn conventions](https://scikit-learn.org/stable/developers/develop.html). Here are some of the most important things to keep in mind, followed by a template. 137 | 138 | - Make sure that your preprocessor inherits from `numerblox.preprocessing.base.BasePreProcessor`. This will automatically implement a blank fit method. It will also inherit from `sklearn.base.TransformerMixin` and `sklearn.base.BaseEstimator`. 139 | - Make sure your preprocessor implements a `transform` method that can take a `np.array` or `pd.DataFrame` as input and outputs an `np.array`. If your preprocessor can only work with `pd.DataFrame` input, mention this explicitly in the docstring. 140 | - Implement a `get_feature_names_out` method so it can support `pd.DataFrame` output with valid column names. 141 | 142 | ```py 143 | import numpy as np 144 | import pandas as pd 145 | from typing import Union 146 | from sklearn.utils.validation import check_is_fitted, check_array 147 | from numerblox.preprocessing.base import BasePreProcessor 148 | 149 | class MyAwesomePreProcessor(BasePreProcessor): 150 | def __init__(self, random_state: int = 0): 151 | super().__init__() 152 | # If you introduce additional arguments be sure to add them as attributes. 153 | self.random_state = random_state 154 | 155 | def fit(self, X: Union[np.array, pd.DataFrame], y=None): 156 | # Attributes can be set here for later use. 157 | self.n_cols_ = X.shape[1] 158 | return self 159 | 160 | def transform(self, X: Union[np.array, pd.DataFrame]) -> np.array: 161 | # Do your preprocessing here. 162 | # Can involve additional checks. 163 | check_is_fitted(self) 164 | X = check_array(X) 165 | return X 166 | 167 | def get_feature_names_out(self, input_features=None) -> list: 168 | # Return a list of feature names. 169 | # If you are not using pandas output, you can skip this method. 170 | check_is_fitted(self) 171 | return [f"awesome_output_feature_{i}" for i in range(self.n_cols_)] 172 | ``` 173 | -------------------------------------------------------------------------------- /docs/submission.md: -------------------------------------------------------------------------------- 1 | # Submitters 2 | 3 | NumerBlox provides submitters for both Numerai Classic and Signals. 4 | Also check out `examples/submitting.ipynb` for more information on Numerai submission. 5 | 6 | ## Why? 7 | - **Simplified Workflow**: Instead of managing multiple manual steps for submissions, `Submitters` allow you to simplify the submission process down to a few lines of code. 8 | 9 | - **Integrated Validation Checks**: Before submitting, `Submitters` perform a series of checks to ensure the submission format is correct and prevent common mistakes that could lead to invalid submissions. 10 | 11 | - **Security**: By providing a way to load credentials from a `.json` file, `Submitters` ensure that you're not hard-coding your secret credentials in the main code, reducing the risk of accidental exposure. 12 | 13 | - **Automatic Cleanup**: For users who run automated jobs, the ability to automatically clean up the environment post-submission ensures that your workspace remains clutter-free. 14 | 15 | With `Submitters`, you can focus more on developing and refining your model and spend less time on the manual aspects of the submission process.
16 | 17 | ## Instantiation 18 | 19 | In order to use a Submitter you should first create a `Key` object which handles credentials. 20 | There are two ways to create a `Key`: 21 | 22 | **1. Initialize `Key` with `pub_id` and `secret_key` from memory.** 23 | 24 | ```py 25 | from numerblox.misc import Key 26 | key = Key(pub_id="Hello", secret_key="World") 27 | ``` 28 | 29 | **2. Load credentials from `.json` file with `load_key_from_json`.** 30 | 31 | JSON file should have the following format: 32 | ```json 33 | {"pub_id": "PUBLIC_ID", "secret_key": "SECRET_KEY"} 34 | ``` 35 | We recommend loading from `.json`. With this method you only have to save your credentials in one (safe) place and avoid leaving reference to a secret key in Python code. 36 | 37 | ```py 38 | from numerblox.misc import load_key_from_json 39 | key = load_key_from_json("my_credentials.json") 40 | ``` 41 | 42 | ## Numerai Classic 43 | 44 | Submissions can be done in 2 lines of code. To initialize the submitter object, pass a directory path for saving submissions and a `Key` object. 45 | 46 | `NumeraiClassicSubmitter.full_submission` will perform: 47 | 1. Checks to prevent surprise behavior (including value range and column validity) 48 | 2. Saving to CSV 49 | 3. Uploading with `numerapi`. 50 | 51 | The `dataf` argument can be either a `pd.DataFrame` or `NumerFrame`. 52 | 53 | ```py 54 | from numerblox.submission import NumeraiClassicSubmitter 55 | submitter = NumeraiClassicSubmitter(directory_path="sub_current_round", key=key) 56 | # Your prediction file with 'id' as index and defined 'cols' below. 57 | dataf = pd.DataFrame(columns=["prediction"]) 58 | # Only works with valid key credentials and model_name 59 | submitter.full_submission(dataf=dataf, 60 | cols="prediction", 61 | file_name="submission.csv", 62 | model_name="my_model") 63 | ``` 64 | 65 | ## Numerai Signals 66 | 67 | `NumeraiSignalsSubmitter` is very similar to `NumeraiClassicSubmitter`, but has a few additional checks specific to Signals. Mainly, it checks if the data contains a valid ticker column (`"cusip"`, `"sedol"`, `"ticker"`, `"numerai_ticker"` or `"bloomberg_ticker"`) and a `'signal'` column. 68 | 69 | `NumeraiSignalsSubmitter.full_submission` handles checks, saving of CSV and uploading with `numerapi`. 70 | 71 | ```py 72 | from numerblox.submission import NumeraiSignalsSubmitter 73 | submitter = NumeraiSignalsSubmitter(directory_path="sub_current_round", key=key) 74 | # Your prediction file with 'id' as index, a valid ticker column and signal column below. 75 | dataf = pd.DataFrame(columns=['bloomberg_ticker', 'signal']) 76 | # Only works with valid key credentials and model_name 77 | submitter.full_submission(dataf=dataf, 78 | cols=["bloomberg_ticker", "signal"], 79 | file_name="submission.csv", 80 | model_name="my_signals_model") 81 | ``` 82 | 83 | ## Numerai Crypto 84 | 85 | `NumeraiCryptoSubmitter` has checks specific to Crypto. Mainly, it checks if the data contains a valid symbol column (`"symbol"`) and a `'signal'` column. 86 | 87 | `NumeraiCryptoSubmitter.full_submission` handles checks, saving of CSV and uploading with `numerapi`. 88 | 89 | ```py 90 | from numerblox.submission import NumeraiCryptoSubmitter 91 | submitter = NumeraiCryptoSubmitter(directory_path="sub_current_round", key=key) 92 | # Your prediction file with 'id' as index, a valid symbol column and signal column below. 
93 | dataf = pd.DataFrame(columns=['symbol', 'signal']) 94 | # Only works with valid key credentials and model_name 95 | submitter.full_submission(dataf=dataf, 96 | cols=["symbol", "signal"], 97 | file_name="submission.csv", 98 | model_name="my_crypto_model") 99 | ``` 100 | 101 | ## NumerBay 102 | 103 | NumerBlox also offers functionality to submit predictions from [NumerBay](https://numerbay.ai). This is a marketplace where Numerai predictions are bought and sold. Uploading from Numerbay is similar, but also requires authentication with your NumerBay account. 104 | 105 | Also make sure the `numerbay` library is installed. 106 | 107 | ```bash 108 | pip install numerbay 109 | ``` 110 | 111 | ```py 112 | from numerblox.submission import NumeraiClassicSubmitter, NumerBaySubmitter 113 | # Your prediction DataFrame 114 | dataf = pd.DataFrame(columns=["prediction"]) 115 | 116 | # Full submission to both Numerai and NumerBay 117 | numerbay_submitter = NumerBaySubmitter( 118 | tournament_submitter = NumeraiClassicSubmitter(directory_path="sub_current_round", key=key), 119 | numerbay_username="yourusername", 120 | numerbay_password="yourpassword" 121 | ) 122 | numerbay_submitter.full_submission( 123 | dataf=dataf, 124 | model_name="my_model", 125 | numerbay_product_full_name="numerai-predictions-yourproductname", 126 | file_name="submission.csv" 127 | ) 128 | ``` 129 | 130 | ## Note 131 | 132 | When you are done with submissions and don't need the submission file you can remove the submission directory with 1 line. Convenient if you have automated jobs and want to avoid clutter due to saving submission files for every round. 133 | 134 | ```py 135 | # Clean up environment 136 | submitter.remove_base_directory() 137 | ``` -------------------------------------------------------------------------------- /docs/targets.md: -------------------------------------------------------------------------------- 1 | # Target Engineering 2 | 3 | Target engineering object allows you to easily create synthetic targets to train on or to convert raw price data into Numerai-style targets. 4 | 5 | ## Why? 6 | 7 | - **Enhanced Experimentation**: The availability of synthetic targets through the `BayesianGMMTargetProcessor` allows modelers to test new algorithms, techniques, or strategies. 8 | 9 | - **Align with Numerai's Methodology**: `SignalsTargetProcessor` ensures that the targets you use are consistent with Numerai's approach. This alignment boosts the relevance of your models, potentially leading to better performance in the competition. 10 | 11 | - **Versatility**: With different windows and target types, `SignalsTargetProcessor` offers a rich set of features, allowing for a more nuanced approach to model training. By exploring different timeframes and target representations, you can gain a deeper understanding of the data's dynamics. 12 | 13 | - **Efficiency**: Manually engineering features or creating synthetic targets can be time-consuming and error-prone. These processors automate intricate steps, saving time and ensuring accuracy. 14 | 15 | By integrating these processors into your workflow, you can enhance your modeling capabilities, streamline experimentation, and align closer to Numerai's expectations. 16 | 17 | ## BayesianGMMTargetProcessor 18 | 19 | The `BayesianGMMTargetProcessor` generates synthetic targets based on a Bayesian Gaussian Mixture model. It's primarily used for creating fake targets, which are useful for experimenting and validating model structures without exposing true labels. 
20 | 21 | ### Example: 22 | ```py 23 | from numerblox.targets import BayesianGMMTargetProcessor 24 | processor = BayesianGMMTargetProcessor(n_components=3) 25 | processor.fit(X=train_features, y=train_targets, era_series=train_eras) 26 | fake_target = processor.transform(X=train_features, era_series=train_eras) 27 | ``` 28 | 29 | For more detailed examples and use-cases, check out `examples/synthetic_data_generation.ipynb.` 30 | 31 | 32 | ## SignalsTargetProcessor 33 | 34 | The `SignalsTargetProcessor` is specifically designed to engineer targets for Numerai Signals. This involves converting raw price data into Numerai-style targets. 35 | 36 | ### Example: 37 | ```py 38 | from numerblox.targets import SignalsTargetProcessor 39 | processor = SignalsTargetProcessor(price_col="close") 40 | signals_target_data = processor.transform(dataf=data, era_series=eras_column) 41 | ``` -------------------------------------------------------------------------------- /examples/google_cloud_storage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "All `Downloaders` and `Submittors` support Google Cloud Storage (GCS).\n", 8 | "\n", 9 | "__Credentials are detected automatically in the following way:__\n", 10 | "1. The environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set and points to a valid `.json` file.\n", 11 | "\n", 12 | "2. (Fallback 1) You have a valid Cloud SDK installation.\n", 13 | "\n", 14 | "3. (Fallback 2) The machine running the code is a GCP machine." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from numerblox.download import NumeraiClassicDownloader" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Example usage" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In order to use GCS you should:\n", 38 | "1. Instantiate a `Downloader` or `Submitter`.\n", 39 | "\n", 40 | "2a. For single files, call `.upload_file_to_gcs` or `.download_file_from_gcs`.\n", 41 | "\n", 42 | "2b. For directories, call `.upload_directory_to_gcs` or `.download_directory_from_gcs`." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "#### 1a. Downloading Numerai Classic inference data and uploading to GCS" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# This should point to a valid GCS bucket within your Google Cloud environment.\n", 59 | "bucket_name = \"test\"\n", 60 | "\n", 61 | "# Get inference data for current round\n", 62 | "downloader = NumeraiClassicDownloader(\"round_n\")\n", 63 | "downloader.download_inference_data(\"inference\", version=\"5.0\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "All the data that has been downloaded can be uploaded to a GCS bucket with 1 line of code." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Upload inference data for most recent round to GCS\n", 80 | "# downloader.upload_directory_to_gcs(bucket_name=bucket_name, gcs_path=\"round_n\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "#### 2b. 
Downloading inference data from GCS Bucket\n", 88 | "\n", 89 | "Conversely, A directory stored in a GCS bucket can be downloaded to your local directory. It will be stored in the base directory specified when you instantiated `nmr_downloader`." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Download data from bucket to local directory\n", 99 | "# downloader.download_directory_from_gcs(bucket_name=bucket_name, gcs_path=\"round_n\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Your local environment can be cleaned up with 1 line of code. Convenient if you are done with inference and would like to delete downloaded inference data automatically." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# Clean up environment\n", 116 | "downloader.remove_base_directory()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "------------------------------------" 124 | ] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "python3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.9.12" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 0 148 | } 149 | -------------------------------------------------------------------------------- /examples/numerbay_integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This part of the tutorial demonstrates how to use `NumerFrame` to download predictions bought on [NumerBay](http://numerbay.ai/) community marketplace. Currently only the main tournament is supported. Signals support will be added in future." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from numerblox.download import NumeraiClassicDownloader\n", 17 | "from numerblox.numerframe import create_numerframe" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "First, we download validation data using `NumeraiClassicDownloader`." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "downloader = NumeraiClassicDownloader(\"numerframe_edu\")\n", 34 | "# Path variables\n", 35 | "tournament_file = \"v5.0/validation.parquet\"\n", 36 | "tournament_save_path = f\"{str(downloader.dir)}/{tournament_file}\"\n", 37 | "# Download only tournament parquet file\n", 38 | "downloader.download_single_dataset(tournament_file, dest_path=tournament_save_path)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Loading in data and initializing a `NumerFrame` takes one line of code. It will automatically recognize the data format such as `.csv` or `.parquet`. You have the option to add metadata, which is stored in the `meta` attribute." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Initialize NumerFrame from parquet file path\n", 55 | "dataf = create_numerframe(tournament_save_path)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "dataf.head(2)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Call the `predict` method on the `NumerFrame` to fetch the prediction file from NumerBay. If the file already exists in the `data_directory`, that file will be loaded without re-downloading." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# preds = nb_model.predict(dataf)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "The predictions are concatenated to the `NumerFrame` with column name `prediction_numerai-predictions-numerbay`" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# preds" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "In this part of the tutorial we have downloaded a prediction file from NumerBay with `NumerFrame`. This makes things easier for post processing such as ensembling and neutralization." 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "------------------------------------------------------" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# NumerBay submission" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "This part of the tutorial is for sellers who want to upload their predictions to NumerBay to fulfill sale orders. Using `NumerBaySubmitter`, a seller can choose to submit to both Numerai and NumerBay or just NumerBay." 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Assume we have some prediction column to upload for the Numerai main tournament, in this case the `prediction` column which simply takes the value of a feature." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# dataf = create_numerframe(tournament_save_path)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# dataf['prediction'] = dataf['feature_dichasial_hammier_spawner']" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Set `upload_to_numerai` to True (default) if you want to submit to both Numerai and NumerBay, set to False to submit only to NumerBay." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# nb_submitter = NumerBaySubmitter(tournament_submitter=numerai_submitter, upload_to_numerai=True, numerbay_username=\"numerbay\", numerbay_password=\"your_password\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Finally, we call the `full_submission` method to perform the submission" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# nb_submitter.full_submission(dataf, file_name='upload-full.csv', model_name='numerbay', numerbay_product_full_name='numerai-predictions-numerbay', cols='prediction')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "The process for Signals submission is very similar and is omitted for brevity, just do the following:\n", 189 | "- Use Signals NumerFrame\n", 190 | "- Change `NumeraiClassicSubmitter` to `NumeraiSignalsSubmitter` for the `tournament_submitter` argument\n", 191 | "- When calling `full_submission`, change the `cols` argument to the list of Signals column to submit (e.g. `['bloomberg_ticker', 'signal']`)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "------------------------------------------------------" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "After we are done we can easily clean up our downloaded data with one line of code called from the downloader." 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# Clean up environment\n", 215 | "downloader.remove_base_directory()" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "python3", 222 | "language": "python", 223 | "name": "python3" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /examples/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 0. Dependencies" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from xgboost import XGBRegressor\n", 18 | "\n", 19 | "from numerblox.download import NumeraiClassicDownloader\n", 20 | "from numerblox.evaluation import NumeraiClassicEvaluator\n", 21 | "from numerblox.misc import Key\n", 22 | "from numerblox.numerframe import create_numerframe\n", 23 | "from numerblox.prediction_loaders import ExamplePredictions\n", 24 | "from numerblox.submission import NumeraiClassicSubmitter" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# 1. Download" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "`NumeraiClassicDownloader` allows you to download training and inference data with a single line of code." 
39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Download data\n", 48 | "downloader = NumeraiClassicDownloader(\"data\")\n", 49 | "# Training and validation data\n", 50 | "downloader.download_training_data(\"train_val\", version=\"5.0\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# 2. Train" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "We use a custom Pandas DataFrame data structure called `NumerFrame` with `create_numerframe` here to easily parse the Numerai data. The usage of `NumerFrame` is completely optional, but greatly simplify the building of Numerai pipelines and experimentation with Numerai data.\n", 65 | "\n", 66 | "We then fit a simple XGBoost regressor model." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "df = create_numerframe(\"data/train_val/train.parquet\")\n", 76 | "X, y = df.sample(100).get_feature_target_pair(multi_target=False)\n", 77 | "xgb = XGBRegressor()\n", 78 | "xgb.fit(X.values, y.values)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# 3. Evaluate" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "`NumeraiClassicEvaluator` will calculate all relevant Numerai metrics. \n", 93 | "\n", 94 | "`ExamplePredictions` is a NumerBlox class that handles downloading of example predictions for you. This object like all other NumerBlox processors can also used end to end in a scikit-learn pipeline." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "val_df = create_numerframe(\"data/train_val/validation.parquet\")[:100]\n", 104 | "val_df[\"prediction\"] = xgb.predict(val_df.get_feature_data)\n", 105 | "val_df[\"example_preds\"] = ExamplePredictions(\"v5.0/validation_example_preds.parquet\").fit_transform(None)[\"prediction\"].values[:100]\n", 106 | "evaluator = NumeraiClassicEvaluator()\n", 107 | "metrics = evaluator.full_evaluation(val_df, example_col=\"example_preds\", pred_cols=[\"prediction\"], target_col=\"target\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "# 4. Inference" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Here again `NumeraiClassicDownloader` and `NumerFrame` are leveraged to simplify inference." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "downloader.download_inference_data(\"current_round\", version=\"5.0\")\n", 131 | "live_df = create_numerframe(file_path=\"data/current_round/live.parquet\")\n", 132 | "live_X, live_y = live_df.get_feature_target_pair(multi_target=False)\n", 133 | "preds = xgb.predict(live_X)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# 5. Submission" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "`NumeraiClassicSubmitter` takes care of data integrity checks and submission to Numerai for you. Credentials are conveniently initialized with a `Key` object." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Submit\n", 157 | "NUMERAI_PUBLIC_ID = \"YOUR_PUBLIC_ID\"\n", 158 | "NUMERAI_SECRET_KEY = \"YOUR_SECRET_KEY\"\n", 159 | "key = Key(pub_id=NUMERAI_PUBLIC_ID, secret_key=NUMERAI_SECRET_KEY)\n", 160 | "submitter = NumeraiClassicSubmitter(directory_path=\"sub_current_round\", key=key)\n", 161 | "# Your prediction file with 'id' as index and defined 'cols' below.\n", 162 | "pred_dataf = pd.DataFrame(preds, index=live_df.index, columns=[\"prediction\"])\n", 163 | "# Only works with valid key credentials and model_name\n", 164 | "# submitter.full_submission(dataf=pred_dataf,\n", 165 | "# cols=\"prediction\",\n", 166 | "# file_name=\"submission.csv\",\n", 167 | "# model_name=\"MY_MODEL_NAME\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# 6. Clean up environment (optional)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "All downloader and submitter have functionality to remove themselver. This is especially convenient if you are running a daily inference pipeline on your server or a cloud VM." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "downloader.remove_base_directory()\n", 191 | "submitter.remove_base_directory()" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "classic_prod", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.11.5" 212 | }, 213 | "orig_nbformat": 4 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 2 217 | } 218 | -------------------------------------------------------------------------------- /examples/synthetic_data_generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This example notebook covers ways to generate synthetic data using `numerblox` components. Synthetic data can be a great way to improve performance simply by having more data to train. We will both cover ways to generate synthetic target variables and features." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 0. 
Download and load" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from uuid import uuid4\n", 24 | "\n", 25 | "import pandas as pd\n", 26 | "\n", 27 | "from numerblox.download import NumeraiClassicDownloader" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "unique_id = uuid4()\n", 37 | "\n", 38 | "dl = NumeraiClassicDownloader(directory_path=f\"synth_test_{unique_id}\")\n", 39 | "dl.download_training_data(version=\"5.0\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "dataf = pd.read_parquet(f\"synth_test_{unique_id}/train.parquet\")" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "dataf.head(2)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 1. Synthetic target (Bayesian GMM)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "First we will tackle the problem of creating a synthetic target column to improve model performance. `BayesianGMMTargetProcessor` allows you to generate a new target variable based on a given target. The preprocessor sample the target from a [Bayesian Gaussian Mixture model](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html) which is fitted on coefficients from a [regularized linear model (Ridge regression)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html).\n", 72 | "\n", 73 | "This implementation is based on a [Github Gist by Michael Oliver (mdo)](https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93)." 
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from numerblox.targets import BayesianGMMTargetProcessor" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "dataf.head()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "bgmm = BayesianGMMTargetProcessor()\n", 101 | "bgmm.set_output(transform=\"pandas\")\n", 102 | "sample = dataf.sample(1000)\n", 103 | "X = sample[[\"feature_polaroid_vadose_quinze\", \"feature_genuine_kyphotic_trehala\"]].fillna(0.5)\n", 104 | "y = sample[\"target\"]\n", 105 | "eras = sample[\"era\"]\n", 106 | "bgmm.fit(X, y, eras=eras)\n", 107 | "fake_target = bgmm.transform(X, eras=eras)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "fake_target.head(10)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "# Clean up environment\n", 126 | "dl.remove_base_directory()" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "python3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.9.12" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 0 151 | } 152 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: numerblox 2 | theme: 3 | name: material 4 | palette: 5 | # Palette toggle for light mode 6 | - media: "(prefers-color-scheme: light)" 7 | scheme: default 8 | primary: black 9 | accent: cyan 10 | toggle: 11 | icon: material/brightness-7 12 | name: Switch to dark mode 13 | # Palette toggle for dark mode 14 | - media: "(prefers-color-scheme: dark)" 15 | scheme: slate 16 | primary: black 17 | accent: cyan 18 | toggle: 19 | icon: material/brightness-4 20 | name: Switch to light mode 21 | font: 22 | text: Roboto 23 | custom_dir: docs/assets 24 | icon: 25 | logo: cc_white 26 | favicon: assets/.icons/favicon.ico 27 | features: 28 | - navigation.top 29 | - navigation.tracking 30 | - navigation.expand 31 | - navigation.path 32 | - content.code.copy 33 | - navigation.instant 34 | - navigation.instant.prefetch 35 | - navigation.sections 36 | 37 | repo_url: https://github.com/crowdcent/numerblox/ 38 | repo_name: crowdcent/numerblox 39 | 40 | nav: 41 | - Home: index.md 42 | - End-To-End Examples: end_to_end.md 43 | 44 | - Blox: 45 | - Downloaders: download.md 46 | - NumerFrame: numerframe.md 47 | - Preprocessing: preprocessing.md 48 | - Target Engineering: targets.md 49 | - Postprocessing: postprocessing.md 50 | - Meta Pipelines: meta.md 51 | - Prediction Loaders: prediction_loaders.md 52 | - Models: models.md 53 | - Evaluation: evaluation.md 54 | - Submitters: submission.md 55 | - Model Upload: model_upload.md 56 | 57 | - API: 58 | - API Reference: api.md 59 | 60 | - More: 61 | - How To Contribute: contributing.md 62 | - About CrowdCent: 
crowdcent.md 63 | - Disclaimer: disclaimer.md 64 | 65 | extra: 66 | social: 67 | - icon: fontawesome/brands/github 68 | link: https://github.com/CrowdCent 69 | name: crowdcent on github 70 | - icon: fontawesome/brands/x-twitter 71 | link: https://x.com/CrowdCent 72 | name: crowdcent on X 73 | generator: false 74 | 75 | copyright: Made by CrowdCent 76 | plugins: 77 | - search 78 | - mkdocstrings -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "numerblox" 3 | version = "1.6.1" 4 | description = "Solid Numerai Pipelines" 5 | authors = [ 6 | {name = "CrowdCent", email = "support@crowdcent.com"}, 7 | ] 8 | license = {text = "MIT License"} 9 | readme = "README.md" 10 | requires-python = ">=3.10,<4" 11 | dependencies = [ 12 | "tqdm>=4.66.1", 13 | "numpy>=1.26.3", 14 | "scipy>=1.10.0", 15 | "pandas>=2.1.1", 16 | "pandas-ta-classic>=0.3.14b", 17 | "joblib>=1.3.2", 18 | "pyarrow>=14.0.1", 19 | "numerapi>=2.19.1", 20 | "matplotlib>=3.4.0", 21 | "scikit-learn>=1.6.1", 22 | "python-dateutil>=2.8.2", 23 | "google-cloud-storage>=2.11.0", 24 | "numerai-era-data>=0.1.1", 25 | "numerai-tools>=0.2.2", 26 | "polars>=1.5.0", 27 | "werkzeug>=3.0.3", 28 | ] 29 | 30 | [project.optional-dependencies] 31 | test = [ 32 | "pytest<8.0.0,>=7.4.2", 33 | "pytest-cov<5.0.0,>=4.1.0", 34 | "pytest-mock<4.0.0,>=3.11.1", 35 | "mkdocs<2.0.0,>=1.5.3", 36 | "mkdocs-material<10.0.0,>=9.4.2", 37 | "eod<1.0.0,>=0.2.1", 38 | "kaggle<2.0.0,>=1.5.16", 39 | "scikit-lego>=0.9.1", 40 | "xgboost>=2.0.0", 41 | "mkdocstrings-python<2.0.0,>=1.7.1", 42 | "ruff>=0.3.0", 43 | ] 44 | 45 | [tool.ruff] 46 | line-length = 300 47 | 48 | [tool.ruff.lint] 49 | ignore = ["F403", "F811"] 50 | select = ["E", "F", "I"] 51 | 52 | [tool.ruff.lint.per-file-ignores] 53 | "__init__.py" = ["F401"] 54 | 55 | [build-system] 56 | requires = ["setuptools>=61.0"] 57 | build-backend = "setuptools.build_meta" 58 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:pkg_resources is deprecated as an API:DeprecationWarning 4 | -------------------------------------------------------------------------------- /src/numerblox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/src/numerblox/__init__.py -------------------------------------------------------------------------------- /src/numerblox/ensemble.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scipy 7 | import sklearn 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | 10 | 11 | class NumeraiEnsemble(TransformerMixin, BaseEstimator): 12 | """ 13 | Ensembler that standardizes predictions by era and averages them. 14 | :param weights: Sequence of weights (float or int), optional, default: None. 15 | If None, then uniform weights are used. 16 | :param n_jobs: The number of jobs to run in parallel for fit. 17 | Will revert to 1 CPU core if not defined. 18 | -1 means using all processors. 19 | :param donate_weighted: Whether to use Donate et al.'s weighted average formula. 
20 | Often used when ensembling predictions from multiple folds over time. 21 | Paper Link: https://doi.org/10.1016/j.neucom.2012.02.053 22 | Example donate weighting for 5 folds: [0.0625, 0.0625, 0.125, 0.25, 0.5] 23 | """ 24 | 25 | def __init__(self, weights=None, donate_weighted=False): 26 | sklearn.set_config(enable_metadata_routing=True) 27 | self.set_transform_request(era_series=True) 28 | self.set_predict_request(era_series=True) 29 | super().__init__() 30 | self.weights = weights 31 | if self.weights and sum(self.weights) != 1: 32 | warnings.warn(f"Warning: Weights do not sum to 1. Got {sum(self.weights)}.") 33 | self.donate_weighted = donate_weighted 34 | 35 | def fit(self, X: Union[np.array, pd.DataFrame], y=None): 36 | self.is_fitted_ = True 37 | return self 38 | 39 | def transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: 40 | """ 41 | Standardize by era and ensemble. 42 | :param X: Input data where each column contains predictions from an estimator. 43 | :param era_series: Era labels (strings) for each row in X. 44 | :return: Ensembled predictions. 45 | """ 46 | assert era_series is not None, "Era series must be provided for NumeraiEnsemble." 47 | assert len(X) == len(era_series), f"input X and era_series must have the same length. Got {len(X)} != {len(era_series)}." 48 | 49 | if len(X.shape) == 1: 50 | raise ValueError("NumeraiEnsemble requires at least 2 prediction columns. Got 1.") 51 | 52 | n_models = X.shape[1] 53 | if n_models <= 1: 54 | raise ValueError(f"NumeraiEnsemble requires at least 2 predictions columns. Got {len(n_models)}.") 55 | 56 | # Override weights if donate_weighted is True 57 | if self.donate_weighted: 58 | weights = self._get_donate_weights(n=n_models) 59 | else: 60 | weights = self.weights 61 | 62 | if isinstance(X, pd.DataFrame): 63 | X = X.values 64 | # Standardize predictions by era 65 | standardized_pred_list = [] 66 | for i in range(n_models): 67 | # Skip standardization if all predictions are the same 68 | pred = X[:, i] 69 | if np.isnan(pred).any(): 70 | warnings.warn(f"Warning: Some predictions in column '{i}' contain NaNs. Consider checking your estimators. Ensembled predictions will also be a NaN.") 71 | if np.all(pred == pred[0]): 72 | warnings.warn(f"Warning: Predictions in column '{i}' are all constant. Consider checking your estimators. Skipping these estimator predictions in ensembling.") 73 | else: 74 | standardized_pred = self._standardize_by_era(pred, era_series) 75 | standardized_pred_list.append(standardized_pred) 76 | standardized_pred_arr = np.asarray(standardized_pred_list).T 77 | 78 | if not standardized_pred_list: 79 | raise ValueError("Predictions for all columns are constant. No valid predictions to ensemble.") 80 | 81 | # Average out predictions 82 | ensembled_predictions = np.average(standardized_pred_arr, axis=1, weights=weights) 83 | return ensembled_predictions.reshape(-1, 1) 84 | 85 | def fit_transform(self, X: Union[np.array, pd.DataFrame], y=None, era_series: pd.Series = None) -> np.array: 86 | self.fit(X, y) 87 | return self.transform(X, era_series) 88 | 89 | def predict(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: 90 | """ 91 | For if a NumeraiEnsemble happens to be the last step in the pipeline. Has same behavior as transform. 92 | """ 93 | return self.transform(X, era_series=era_series) 94 | 95 | def _standardize(self, X: np.array) -> np.array: 96 | """ 97 | Standardize single era. 98 | :param X: Predictions for a single era. 
99 | :return: Standardized predictions. 100 | """ 101 | percentile_X = (scipy.stats.rankdata(X, method="ordinal") - 0.5) / len(X) 102 | return percentile_X 103 | 104 | def _standardize_by_era(self, X: np.array, era_series: Union[np.array, pd.Series, pd.DataFrame]) -> np.array: 105 | """ 106 | Standardize predictions of a single estimator by era. 107 | :param X: All predictions of a single estimator. 108 | :param era_series: Era labels (strings) for each row in X. 109 | :return: Standardized predictions. 110 | """ 111 | if isinstance(era_series, (pd.Series, pd.DataFrame)): 112 | era_series = era_series.to_numpy().flatten() 113 | df = pd.DataFrame({"prediction": X, "era": era_series}) 114 | df["standardized_prediction"] = df.groupby("era")["prediction"].transform(self._standardize) 115 | return df["standardized_prediction"].values.flatten() 116 | 117 | def _get_donate_weights(self, n: int) -> list: 118 | """ 119 | Exponential weights as per Donate et al.'s formula. 120 | Example donate weighting for 3 folds: [0.25, 0.25, 0.5] 121 | Example donate weighting for 5 folds: [0.0625, 0.0625, 0.125, 0.25, 0.5] 122 | 123 | :param n: Number of estimators. 124 | :return: List of weights. 125 | """ 126 | weights = [] 127 | for j in range(1, n + 1): 128 | j = 2 if j == 1 else j 129 | weights.append(1 / (2 ** (n + 1 - j))) 130 | return weights 131 | 132 | def get_feature_names_out(self, input_features=None) -> List[str]: 133 | return ["numerai_ensemble_predictions"] if not input_features else input_features 134 | 135 | 136 | class PredictionReducer(TransformerMixin, BaseEstimator): 137 | """ 138 | Reduce multiclassification and proba preds to 1 column per model. 139 | If predictions were generated with a regressor or regular predict you don't need this step. 140 | :param n_models: Number of resulting columns. 141 | This indicates how many models were trained to generate the prediction array. 142 | :param n_classes: Number of classes for each prediction. 143 | If predictions were generated with predict_proba and binary classification -> n_classes = 2. 144 | """ 145 | 146 | def __init__(self, n_models: int, n_classes: int): 147 | super().__init__() 148 | if n_models < 1: 149 | raise ValueError(f"n_models must be >= 1. Got '{n_models}'.") 150 | self.n_models = n_models 151 | if n_classes < 2: 152 | raise ValueError(f"n_classes must be >= 2. If n_classes = 1 you don't need PredictionReducer. Got '{n_classes}'.") 153 | self.n_classes = n_classes 154 | self.dot_array = [i for i in range(self.n_classes)] 155 | 156 | def fit(self, X: np.array, y=None): 157 | return self 158 | 159 | def transform(self, X: np.array): 160 | """ 161 | :param X: Input predictions. 162 | :return: Reduced predictions of shape (X.shape[0], self.n_models). 163 | """ 164 | reduced = [] 165 | expected_n_cols = self.n_models * self.n_classes 166 | if len(X.shape) != 2: 167 | raise ValueError(f"Expected X to be a 2D array. Got '{len(X.shape)}' dimension(s).") 168 | if X.shape[1] != expected_n_cols: 169 | raise ValueError(f"Input X must have {expected_n_cols} columns. Got {X.shape[1]} columns while n_models={self.n_models} * n_classes={self.n_classes} = {expected_n_cols}. 
") 170 | for i in range(self.n_models): 171 | # Extracting the predictions of the i-th model 172 | model_preds = X[:, i * self.n_classes : (i + 1) * self.n_classes] 173 | r = model_preds @ self.dot_array 174 | reduced.append(r) 175 | reduced_arr = np.column_stack(reduced) 176 | return reduced_arr 177 | 178 | def predict(self, X: np.array): 179 | """ 180 | For if PredictionReducer happens to be the last step in the pipeline. Has same behavior as transform. 181 | :param X: Input predictions. 182 | :return: Reduced predictions of shape (X.shape[0], self.n_models). 183 | """ 184 | return self.transform(X) 185 | 186 | def get_feature_names_out(self, input_features=None) -> List[str]: 187 | return [f"reduced_prediction_{i}" for i in range(self.n_models)] if not input_features else input_features 188 | -------------------------------------------------------------------------------- /src/numerblox/misc.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class AttrDict(dict): 5 | """Access dictionary elements as attributes.""" 6 | 7 | def __init__(self, *args, **kwargs): 8 | super(AttrDict, self).__init__(*args, **kwargs) 9 | self.__dict__ = self 10 | 11 | 12 | class Key: 13 | """Numerai credentials.""" 14 | 15 | def __init__(self, pub_id: str, secret_key: str): 16 | self.pub_id = pub_id 17 | self.secret_key = secret_key 18 | 19 | def __repr__(self): 20 | return f"Numerai Auth Key. pub_id = '{self.pub_id}'" 21 | 22 | def __str__(self): 23 | return self.__repr__() 24 | 25 | 26 | def load_key_from_json(file_path: str, *args, **kwargs): 27 | """ 28 | Initialize Key object from JSON file. \n 29 | Credentials file must have the following format: \n 30 | `{"pub_id": "PUBLIC_ID", "secret_key": "SECRET_KEY"}` 31 | """ 32 | with open(file_path) as json_file: 33 | json_data = json.load(json_file, *args, **kwargs) 34 | pub_id = json_data["pub_id"] 35 | secret_key = json_data["secret_key"] 36 | return Key(pub_id=pub_id, secret_key=secret_key) 37 | -------------------------------------------------------------------------------- /src/numerblox/model_upload.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Any, Callable, List, Optional, Union 3 | 4 | import cloudpickle 5 | import pandas as pd 6 | from numerapi import NumerAPI 7 | 8 | from .misc import Key 9 | 10 | 11 | class NumeraiModelUpload: 12 | """ 13 | A class to handle the uploading of machine learning models to Numerai's servers. 14 | 15 | :param key: API key object containing public and secret keys for NumerAPI authentication. 16 | :param max_retries: Maximum number of attempts to upload the model. 17 | :param sleep_time: Number of seconds to wait between retries. 18 | :param fail_silently: Whether to suppress exceptions during upload. 19 | """ 20 | 21 | def __init__(self, key: Key = None, max_retries: int = 2, sleep_time: int = 10, fail_silently: bool = False, *args, **kwargs): 22 | """ 23 | Initializes the NumeraiModelUpload class with the necessary configuration. 24 | 25 | :param key: API key object containing public and secret keys for NumerAPI authentication. 26 | :param max_retries: Maximum number of retry attempts for model upload. 27 | :param sleep_time: Time (in seconds) to wait between retries. 28 | :param fail_silently: If True, suppress errors during model upload. 29 | :param *args: Additional arguments for NumerAPI. 30 | :param **kwargs: Additional keyword arguments for NumerAPI. 
31 | """ 32 | # Initialize NumerAPI with the provided keys and other arguments 33 | self.api = NumerAPI(public_id=key.pub_id, secret_key=key.secret_key, *args, **kwargs) 34 | self.max_retries = max_retries # Set the maximum number of retries 35 | self.sleep_time = sleep_time # Set the sleep time between retries 36 | self.fail_silently = fail_silently # Determine whether to fail silently 37 | 38 | def create_and_upload_model(self, model: Any, feature_cols: Optional[List[str]] = None, model_name: str = None, file_path: str = None, data_version: str = None, docker_image: str = None, custom_predict_func: Callable[[pd.DataFrame], pd.DataFrame] = None) -> Union[str, None]: 39 | """ 40 | Creates a model prediction function, serializes it, and uploads the model to Numerai. 41 | :param model: The machine learning model object. 42 | :param feature_cols: List of feature column names for predictions. Defaults to None. 43 | :param model_name: The name of the model to upload. 44 | :param file_path: The file path where the serialized model function will be saved. 45 | :param data_version: Data version to use for model upload. 46 | :param docker_image: Docker image to use for model upload. 47 | :param custom_predict_func: Custom prediction function to use instead of the model's predict method. 48 | 49 | :return: Upload ID if the upload is successful, None otherwise. 50 | """ 51 | # Determine which prediction function to use 52 | if custom_predict_func is not None: 53 | predict = custom_predict_func # Use custom prediction function if provided 54 | else: 55 | # Define default prediction function 56 | def predict(live_features: pd.DataFrame) -> pd.DataFrame: 57 | # Determine feature columns to use for predictions 58 | if feature_cols is None: 59 | feature_cols_local = [col for col in live_features.columns if col.startswith("feature_")] 60 | else: 61 | feature_cols_local = feature_cols 62 | 63 | # Predict using the model 64 | live_predictions = model.predict(live_features[feature_cols_local]) 65 | 66 | # Rank predictions and convert to a DataFrame 67 | submission = pd.Series(live_predictions, index=live_features.index).rank(pct=True, method="first") 68 | return submission.to_frame("prediction") 69 | 70 | # Serialize the prediction function and save to the specified file path 71 | print(f"Serializing the predict function and saving to '{file_path}'") 72 | with open(file_path, "wb") as f: 73 | cloudpickle.dump(predict, f) 74 | 75 | # Get the model ID for the specified model name 76 | model_id = self._get_model_id(model_name=model_name) 77 | api_type = self.api.__class__.__name__ # Get the type of API being used 78 | print(f"{api_type}: Uploading model from '{file_path}' for model '{model_name}' (model_id='{model_id}')") 79 | 80 | # Attempt to upload the model, retrying if necessary 81 | for attempt in range(self.max_retries): 82 | try: 83 | # Attempt to upload the model 84 | upload_id = self.api.model_upload(file_path=file_path, model_id=model_id, data_version=data_version, docker_image=docker_image) 85 | print(f"{api_type} model upload of '{file_path}' for '{model_name}' is successful! Upload ID: {upload_id}") 86 | return upload_id # Return upload ID if successful 87 | except Exception as e: 88 | # Handle failed upload attempts 89 | if attempt < self.max_retries - 1: 90 | print(f"Failed to upload model '{file_path}' for '{model_name}' to Numerai. 
Retrying in {self.sleep_time} seconds...") 91 | print(f"Error: {e}") 92 | time.sleep(self.sleep_time) # Wait before retrying 93 | else: 94 | # Handle final failed attempt 95 | if self.fail_silently: 96 | print(f"Failed to upload model '{file_path}' for '{model_name}' to Numerai. Skipping...") 97 | print(f"Error: {e}") 98 | else: 99 | print(f"Failed to upload model '{file_path}' for '{model_name}' after {self.max_retries} attempts.") 100 | raise e # Raise the exception if not failing silently 101 | 102 | def get_available_data_versions(self) -> dict: 103 | """ 104 | Retrieves the available data versions for model uploads. 105 | 106 | :return: A dictionary of available data versions. 107 | """ 108 | # Call NumerAPI to get available data versions 109 | return self.api.model_upload_data_versions() 110 | 111 | def get_available_docker_images(self) -> dict: 112 | """ 113 | Retrieves the available Docker images for model uploads. 114 | 115 | :return: A dictionary of available Docker images. 116 | """ 117 | # Call NumerAPI to get available Docker images 118 | return self.api.model_upload_docker_images() 119 | 120 | def _get_model_id(self, model_name: str) -> str: 121 | """ 122 | Retrieves the model ID for a given model name. 123 | 124 | :param model_name: The name of the model. 125 | :return: The ID of the model. 126 | 127 | Raises ValueError if the model name is not found in the user's Numerai account. 128 | """ 129 | # Get the mapping of model names to model IDs 130 | model_mapping = self.get_model_mapping 131 | if model_name in model_mapping: 132 | return model_mapping[model_name] # Return the model ID if found 133 | else: 134 | # Raise an error if the model name is not found 135 | available_models = ", ".join(model_mapping.keys()) 136 | raise ValueError(f"Model name '{model_name}' not found in your Numerai account. " f"Available model names: {available_models}") 137 | 138 | @property 139 | def get_model_mapping(self) -> dict: 140 | """ 141 | Retrieves the mapping of model names to their IDs from the user's Numerai account. 142 | 143 | :return: A dictionary mapping model names to model IDs. 144 | """ 145 | # Call NumerAPI to get the model mapping 146 | return self.api.get_models() 147 | -------------------------------------------------------------------------------- /src/numerblox/models.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn 3 | from sklearn.utils.validation import check_is_fitted 4 | from xgboost import XGBRegressor 5 | 6 | from .evaluation import NumeraiClassicEvaluator 7 | 8 | 9 | class EraBoostedXGBRegressor(XGBRegressor): 10 | """ 11 | Custom XGBRegressor model that upweights the worst eras in the data. 12 | The worst eras are determined by Corrv2. 13 | NOTE: Currently only supports single target regression. 14 | 15 | This idea was first proposed by Richard Craib in the Numerai forums: 16 | https://forum.numer.ai/t/era-boosted-models/189 17 | 18 | Credits to Michael Oliver (mdo) for proposing the 1st XGBoost implementation of era boosting: 19 | https://forum.numer.ai/t/era-boosted-models/189/3 20 | 21 | :param proportion: Proportion of eras to upweight. 22 | :param trees_per_step: Number of trees to add per iteration. 23 | :param num_iters: Number of total era boosting iterations. 
24 | """ 25 | 26 | def __init__(self, proportion=0.5, trees_per_step=10, num_iters=200, **xgb_params): 27 | sklearn.set_config(enable_metadata_routing=True) 28 | self.set_fit_request(era_series=True) 29 | super().__init__(**xgb_params) 30 | if not self.n_estimators: 31 | self.n_estimators = 100 32 | assert self.n_estimators >= 1, "n_estimators must be at least 1." 33 | 34 | assert 0 < proportion < 1, "proportion must be between 0 and 1." 35 | self.proportion = proportion 36 | assert trees_per_step >= 0, "trees_per_step must be at least 1." 37 | self.trees_per_step = trees_per_step 38 | assert num_iters >= 2, "num_iters must be at least 2." 39 | self.num_iters = num_iters 40 | 41 | def fit(self, X, y, era_series: pd.Series, **fit_params): 42 | super().fit(X, y, **fit_params) 43 | evaluator = NumeraiClassicEvaluator(era_col="era") 44 | self.feature_names = self.get_booster().feature_names 45 | iter_df = pd.DataFrame(X, columns=self.feature_names) 46 | iter_df["target"] = y 47 | iter_df["era"] = era_series 48 | 49 | for _ in range(self.num_iters - 1): 50 | preds = self.predict(X) 51 | iter_df["predictions"] = preds 52 | era_scores = pd.Series(index=iter_df["era"].unique()) 53 | 54 | # Per era Corrv2 aka "Numerai Corr". 55 | era_scores = evaluator.per_era_numerai_corrs(dataf=iter_df, pred_col="predictions", target_col="target") 56 | # Filter on eras with worst Corrv2. 57 | era_scores.sort_values(inplace=True) 58 | worst_eras = era_scores[era_scores <= era_scores.quantile(self.proportion)].index 59 | worst_df = iter_df[iter_df["era"].isin(worst_eras)] 60 | 61 | # Add estimators and fit on worst eras. 62 | self.n_estimators += self.trees_per_step 63 | booster = self.get_booster() 64 | super().fit(worst_df.drop(columns=["target", "era", "predictions"]), worst_df["target"], xgb_model=booster, **fit_params) 65 | return self 66 | 67 | def get_feature_names_out(self, input_features=None): 68 | """Get output feature names for transformation.""" 69 | check_is_fitted(self) 70 | return self.feature_names if not input_features else input_features 71 | -------------------------------------------------------------------------------- /src/numerblox/neutralizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod 3 | from typing import List, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats as sp 8 | import sklearn 9 | from joblib import Parallel, delayed 10 | from sklearn.base import BaseEstimator, TransformerMixin 11 | from sklearn.preprocessing import MinMaxScaler 12 | from tqdm import tqdm 13 | 14 | 15 | class BaseNeutralizer(TransformerMixin, BaseEstimator): 16 | """ 17 | Base class for neutralization so it is compatible with scikit-learn. 18 | :param new_col_name: Name of new neutralized column. 19 | """ 20 | 21 | def __init__(self, new_col_names: list): 22 | self.new_col_names = new_col_names 23 | sklearn.set_config(enable_metadata_routing=True) 24 | self.set_transform_request(features=True, era_series=True) 25 | self.set_predict_request(features=True, era_series=True) 26 | super().__init__() 27 | 28 | def fit(self, X=None, y=None): 29 | return self 30 | 31 | @abstractmethod 32 | def transform(self, X: Union[np.array, pd.DataFrame], features: pd.DataFrame, era_series: pd.Series) -> np.array: ... 
33 | 34 | def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 35 | """Convenience function for scikit-learn compatibility.""" 36 | return self.transform(X=X, features=features, era_series=era_series) 37 | 38 | def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 39 | """ 40 | Convenience function for scikit-learn compatibility. 41 | Needed because fit and transform except different arguments here. 42 | """ 43 | return self.fit().transform(X=X, features=features, era_series=era_series) 44 | 45 | def get_feature_names_out(self, input_features: list = None) -> list: 46 | """ 47 | Get feature names for neutralized output. 48 | 49 | :param input_features: Optional list of input feature names. 50 | :return: List of feature names for neutralized output. 51 | """ 52 | return input_features if input_features else self.new_col_names 53 | 54 | 55 | class FeatureNeutralizer(BaseNeutralizer): 56 | """ 57 | Classic feature neutralization by subtracting a linear model. 58 | 59 | :param pred_name: Name of prediction column. For creating the new column name. 60 | :param proportion: Number in range [0...1] indicating how much to neutralize. 61 | :param suffix: Optional suffix that is added to new column name. 62 | :param num_cores: Number of cores to use for parallel processing. 63 | By default, all CPU cores are used. 64 | """ 65 | 66 | def __init__(self, pred_name: Union[str, list] = "prediction", proportion: Union[float, List[float]] = 0.5, suffix: str = None, num_cores: int = -1): 67 | self.pred_name = [pred_name] if isinstance(pred_name, str) else pred_name 68 | self.proportion = [proportion] if isinstance(proportion, float) else proportion 69 | assert len(self.pred_name) == len(set(self.pred_name)), "Duplicate 'pred_names' found. Make sure all names are unique." 70 | assert len(self.proportion) == len(set(self.proportion)), "Duplicate 'proportions' found. Make sure all proportions are unique." 71 | for prop in self.proportion: 72 | assert 0.0 <= prop <= 1.0, f"'proportion' should be a float in range [0...1]. Got '{prop}'." 73 | 74 | new_col_names = [] 75 | for pred_name in self.pred_name: 76 | for prop in self.proportion: 77 | new_col_names.append(f"{pred_name}_neutralized_{prop}_{suffix}" if suffix else f"{pred_name}_neutralized_{prop}") 78 | super().__init__(new_col_names=new_col_names) 79 | self.suffix = suffix 80 | self.num_cores = num_cores 81 | 82 | def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], features: pd.DataFrame, era_series: Union[np.array, pd.Series] = None) -> np.array: 83 | """ 84 | Main transform function. 85 | :param X: Input predictions to neutralize. \n 86 | :param features: DataFrame with features for neutralization. \n 87 | :param era_series: Series with era labels for each row in features. \n 88 | Features, era_series and the prediction column must all have the same length. 89 | :return: Neutralized predictions NumPy array. 90 | """ 91 | if era_series is None: 92 | warnings.warn("""WARNING: 'era_series' not provided for 93 | neutralization! Neutralization will be treated as if 'X' is 1 era of data. Ensure you are not passing multiple eras to neutralization in this way! Not providing 'era_series' is valid for live inference, where only one era is used to generate predictions.""") 94 | else: 95 | assert len(X) == len(era_series), "Input predictions must have same length as era_series." 
96 | assert len(X) == len(features), "Input predictions must have same length as features." 97 | 98 | df = features.copy() 99 | if not isinstance(X, np.ndarray): 100 | X = np.array(X) 101 | # Ensure X is a 2D array and has the same number of columns as pred_name 102 | if X.ndim == 1: 103 | assert len(self.pred_name) == 1, "Only one prediction column found. Please input a 2D array or define one column for 'pred_name'." 104 | X = X.reshape(-1, 1) 105 | else: 106 | assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'." 107 | for i, pred_name in enumerate(self.pred_name): 108 | df[pred_name] = X[:, i] 109 | # Treat input as 1 era if era_series is not provided. 110 | df["era"] = era_series if era_series is not None else "X" 111 | 112 | feature_cols = list(features.columns) 113 | tasks = [delayed(self._process_pred_name)(df, pred_name, proportion, feature_cols) for pred_name in tqdm(self.pred_name, desc="Processing feature neutralizations") for proportion in self.proportion] 114 | neutralized_results = Parallel(n_jobs=self.num_cores)(tasks) 115 | neutralized_preds = pd.concat(neutralized_results, axis=1).to_numpy() 116 | return neutralized_preds 117 | 118 | def _process_pred_name(self, df: pd.DataFrame, pred_name: str, proportion: float, feature_cols: List[str]) -> pd.DataFrame: 119 | """ 120 | Process one combination of prediction and proportion. 121 | :param df: DataFrame with features and predictions. 122 | :param pred_name: Name of prediction column. 123 | :param proportion: Proportion to neutralize. 124 | :param feature_cols: List of feature column names. 125 | :return: Neutralized predictions. 126 | Neutralized predictions are scaled to [0...1]. 127 | """ 128 | neutralized_pred = df.groupby("era", group_keys=False).apply(lambda x: self.normalize_and_neutralize(x, [pred_name], feature_cols, proportion)) 129 | return pd.DataFrame(MinMaxScaler().fit_transform(neutralized_pred)) 130 | 131 | def neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame: 132 | """ 133 | Neutralize on CPU. 134 | :param dataf: DataFrame with features and predictions. 135 | :param columns: List of prediction column names. 136 | :param by: List of feature column names. 137 | :param proportion: Proportion to neutralize. 138 | :return: Neutralized predictions. 139 | """ 140 | scores = dataf[columns] 141 | exposures = dataf[by].values 142 | scores = scores - proportion * self._get_raw_exposures(exposures, scores) 143 | return scores / scores.std() 144 | 145 | @staticmethod 146 | def normalize(dataf: pd.DataFrame) -> np.ndarray: 147 | """Normalize predictions. 148 | 1. Rank predictions. 149 | 2. Normalize ranks. 150 | 3. Gaussianize ranks. 151 | :param dataf: DataFrame with predictions. 152 | :return: Gaussianized rank predictions. 153 | """ 154 | normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf) 155 | # Gaussianized ranks 156 | return sp.norm.ppf(normalized_ranks) 157 | 158 | def normalize_and_neutralize(self, dataf: pd.DataFrame, columns: list, by: list, proportion: float) -> pd.DataFrame: 159 | """ 160 | Gaussianize predictions and neutralize with one combination of prediction and proportion. 161 | :param dataf: DataFrame with features and predictions. 162 | :param columns: List of prediction column names. 163 | :param by: List of feature column names. 164 | :param proportion: Proportion to neutralize. 165 | :return: Neutralized predictions DataFrame. 
166 | """ 167 | dataf[columns] = self.normalize(dataf[columns]) 168 | dataf[columns] = self.neutralize(dataf, columns, by, proportion) 169 | return dataf[columns] 170 | 171 | @staticmethod 172 | def _get_raw_exposures(exposures: np.array, scores: pd.DataFrame) -> np.array: 173 | """ 174 | Get raw feature exposures. 175 | Make sure predictions are normalized! 176 | :param exposures: Exposures for each era. 177 | :param scores: DataFrame with predictions. 178 | :return: Raw exposures for each era. 179 | """ 180 | return exposures @ np.linalg.lstsq(exposures, scores.values, rcond=None)[0] 181 | -------------------------------------------------------------------------------- /src/numerblox/penalizers.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy 8 | import sklearn 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | from tqdm.auto import tqdm 11 | 12 | try: 13 | import tensorflow as tf 14 | except ImportError: 15 | warnings.warn("TensorFlow is not installed. Some NumerBlox Penalizers may not work. " "To use all features, please install TensorFlow: `pip install tensorflow`", ImportWarning) 16 | 17 | 18 | class BasePenalizer(TransformerMixin, BaseEstimator): 19 | """ 20 | Base class for penalization so it is compatible with scikit-learn. 21 | :param new_col_name: Name of new neutralized column. 22 | """ 23 | 24 | def __init__(self, new_col_name: str): 25 | sklearn.set_config(enable_metadata_routing=True) 26 | self.set_transform_request(features=True, era_series=True) 27 | self.set_predict_request(features=True, era_series=True) 28 | self.new_col_name = new_col_name 29 | super().__init__() 30 | 31 | def fit(self, X=None, y=None): 32 | return self 33 | 34 | @abstractmethod 35 | def transform(self, X: Union[np.array, pd.DataFrame], features: pd.DataFrame, era_series: pd.Series) -> np.array: ... 36 | 37 | def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: 38 | """Convenience function for scikit-learn compatibility.""" 39 | return self.transform(X=X, features=features, era_series=era_series) 40 | 41 | def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: 42 | """ 43 | Convenience function for scikit-learn compatibility. 44 | Needed because fit and transform except different arguments here. 45 | """ 46 | return self.fit().transform(X=X, features=features, era_series=era_series) 47 | 48 | def get_feature_names_out(self, input_features: list = None) -> list: 49 | """ 50 | Get feature names for neutralized output. 51 | 52 | :param input_features: Optional list of input feature names. 53 | :return: List of feature names for neutralized output. 54 | """ 55 | return input_features if input_features else [self.new_col_name] 56 | 57 | 58 | class FeaturePenalizer(BasePenalizer): 59 | """ 60 | Feature penalization with TensorFlow. 61 | 62 | Source (by jrb): https://github.com/jonrtaylor/twitch/blob/master/FE_Clipping_Script.ipynb 63 | 64 | Source of first PyTorch implementation (by Michael Oliver / mdo): https://forum.numer.ai/t/model-diagnostics-feature-exposure/899/12 65 | 66 | :param max_exposure: Number in range [0...1] indicating how much to reduce max feature exposure to. 67 | :param pred_name: Prediction column name. Used for new column name. 
\n 68 | :param suffix: Optional suffix that is added to new column name. 69 | """ 70 | 71 | def __init__( 72 | self, 73 | max_exposure: float, 74 | pred_name: str = "prediction", 75 | suffix: str = None, 76 | ): 77 | self.max_exposure = max_exposure 78 | self.pred_name = pred_name 79 | assert 0.0 <= max_exposure <= 1.0, f"'max_exposure' should be a float in range [0...1]. Got '{self.max_exposure}'." 80 | new_col_name = f"{self.pred_name}_penalized_{self.max_exposure}_{suffix}" if suffix else f"{self.pred_name}_penalized_{self.max_exposure}" 81 | super().__init__(new_col_name=new_col_name) 82 | self.suffix = suffix 83 | 84 | def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array: 85 | """ 86 | Main transform method. 87 | :param X: Input predictions to neutralize. 88 | :param features: DataFrame with features for neutralization. 89 | :param era_series: Series with era labels for each row in features. 90 | Features, eras and the prediction column must all have the same length. 91 | :return: Penalized predictions. 92 | """ 93 | assert len(X) == len(features), "Input predictions must have same length as features." 94 | assert len(X) == len(era_series), "Input predictions must have same length as eras." 95 | df = features.copy() 96 | df["prediction"] = X 97 | df["era"] = era_series 98 | penalized_data = self._reduce_all_exposures(dataf=df, column=self.pred_name, neutralizers=list(features.columns)) 99 | return penalized_data 100 | 101 | def _reduce_all_exposures( 102 | self, 103 | dataf: pd.DataFrame, 104 | column: str = "prediction", 105 | neutralizers: list = None, 106 | normalize=True, 107 | gaussianize=True, 108 | ) -> pd.DataFrame: 109 | neutralized = [] 110 | 111 | for era in tqdm(dataf["era"].unique()): 112 | dataf_era = dataf[dataf["era"] == era] 113 | scores = dataf_era[[column]].values 114 | exposure_values = dataf_era[neutralizers].values 115 | 116 | if normalize: 117 | scores2 = [] 118 | for x in scores.T: 119 | x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x) 120 | if gaussianize: 121 | x = scipy.stats.norm.ppf(x) 122 | scores2.append(x) 123 | scores = np.array(scores2)[0] 124 | 125 | scores, _ = self._reduce_exposure(scores, exposure_values, len(neutralizers), None) 126 | 127 | scores /= tf.math.reduce_std(scores) 128 | scores -= tf.reduce_min(scores) 129 | scores /= tf.reduce_max(scores) 130 | neutralized.append(scores.numpy()) 131 | 132 | predictions = pd.DataFrame(np.concatenate(neutralized), columns=[column], index=dataf.index) 133 | return predictions 134 | 135 | def _reduce_exposure(self, prediction, features, input_size=50, weights=None): 136 | model = tf.keras.models.Sequential( 137 | [ 138 | tf.keras.layers.Input(input_size), 139 | tf.keras.experimental.LinearModel(use_bias=False), 140 | ] 141 | ) 142 | feats = tf.convert_to_tensor(features - 0.5, dtype=tf.float32) 143 | pred = tf.convert_to_tensor(prediction, dtype=tf.float32) 144 | if weights is None: 145 | optimizer = tf.keras.optimizers.Adamax() 146 | start_exp = self.__exposures(feats, pred[:, None]) 147 | target_exps = tf.clip_by_value(start_exp, -self.max_exposure, self.max_exposure) 148 | self._train_loop(model, optimizer, feats, pred, target_exps) 149 | else: 150 | model.set_weights(weights) 151 | return pred[:, None] - model(feats), model.get_weights() 152 | 153 | def _train_loop(self, model, optimizer, feats, pred, target_exps): 154 | for _ in range(1000000): 155 | loss, grads = self.__train_loop_body(model, feats, pred, target_exps) 156 | 
optimizer.apply_gradients(zip(grads, model.trainable_variables)) 157 | if loss < 1e-7: 158 | break 159 | 160 | def __train_loop_body(self, model, feats, pred, target_exps): 161 | with tf.GradientTape() as tape: 162 | exps = self.__exposures(feats, pred[:, None] - model(feats, training=True)) 163 | loss = tf.reduce_sum(tf.nn.relu(tf.nn.relu(exps) - tf.nn.relu(target_exps)) + tf.nn.relu(tf.nn.relu(-exps) - tf.nn.relu(-target_exps))) 164 | return loss, tape.gradient(loss, model.trainable_variables) 165 | 166 | @staticmethod 167 | def __exposures(x, y): 168 | x = x - tf.math.reduce_mean(x, axis=0) 169 | x = x / tf.norm(x, axis=0) 170 | y = y - tf.math.reduce_mean(y, axis=0) 171 | y = y / tf.norm(y, axis=0) 172 | return tf.matmul(x, y, transpose_a=True) 173 | -------------------------------------------------------------------------------- /src/numerblox/prediction_loaders.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | from uuid import uuid4 4 | 5 | import pandas as pd 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | 8 | from .download import NumeraiClassicDownloader 9 | 10 | 11 | class BasePredictionLoader(TransformerMixin, BaseEstimator): 12 | """Shared functionality for all Prediction Loaders.""" 13 | 14 | def __init__(self): ... 15 | 16 | def fit(self, X=None, y=None): 17 | return self 18 | 19 | @abstractmethod 20 | def transform(self, X=None, y=None) -> pd.DataFrame: 21 | """Return Predictions generated by model.""" 22 | ... 23 | 24 | @abstractmethod 25 | def get_feature_names_out(self, input_features=None): 26 | """Return feature names.""" 27 | ... 28 | 29 | 30 | class ExamplePredictions(BasePredictionLoader): 31 | """ 32 | Load example predictions. 33 | :param file_name: File to download from NumerAPI. 34 | By default this is example predictions for v5.0 data. 35 | 'v5.0/live_example_preds.parquet' by default. 36 | Example predictions in previous versions: 37 | - v5.0. validation examples -> "v5.0/validation_example_preds.parquet" 38 | - v5.0. live benchmark models -> "v5.0/live_benchmark_models.parquet" 39 | - v5.0. validation benchmark models -> "v5.0/validation_benchmark_models.parquet" 40 | :param round_num: Optional round number. Downloads most recent round by default. 41 | :param keep_files: Whether to keep downloaded files. 42 | By default, files are deleted after the predictions are loaded. 
43 | """ 44 | 45 | def __init__(self, file_name: str = "v5.0/live_example_preds.parquet", round_num: int = None, keep_files: bool = False): 46 | super().__init__() 47 | self.file_name = file_name 48 | self.round_num = round_num 49 | self.keep_files = keep_files 50 | 51 | def transform(self, X=None, y=None) -> pd.DataFrame: 52 | """Return example predictions.""" 53 | self._download_example_preds() 54 | example_preds = self._load_example_preds() 55 | if not self.keep_files: 56 | self.downloader.remove_base_directory() 57 | return example_preds 58 | 59 | def _download_example_preds(self): 60 | data_directory = f"example_predictions_loader_{uuid4()}" 61 | self.downloader = NumeraiClassicDownloader(directory_path=data_directory) 62 | self.dest_path = f"{str(self.downloader.dir)}/{self.file_name}" 63 | self.downloader.download_single_dataset(filename=self.file_name, dest_path=self.dest_path, round_num=self.round_num) 64 | 65 | def _load_example_preds(self, *args, **kwargs): 66 | return pd.read_parquet(self.dest_path, *args, **kwargs) 67 | 68 | def get_feature_names_out(self, input_features=None): 69 | return [Path(self.file_name).with_suffix("").as_posix()] if not input_features else input_features 70 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from numerblox.preprocessing.classic import * 2 | from numerblox.preprocessing.signals import * 3 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.base import BaseEstimator, TransformerMixin 8 | 9 | 10 | class BasePreProcessor(TransformerMixin, BaseEstimator): 11 | """Common functionality for preprocessors and postprocessors.""" 12 | 13 | def __init__(self): 14 | sklearn.set_config(enable_metadata_routing=True) 15 | 16 | def fit(self, X, y=None): 17 | self.is_fitted_ = True 18 | return self 19 | 20 | @abstractmethod 21 | def transform(self, X: Union[np.array, pd.DataFrame], y=None, **kwargs) -> pd.DataFrame: ... 22 | 23 | @abstractmethod 24 | def get_feature_names_out(self, input_features=None) -> List[str]: ... 25 | -------------------------------------------------------------------------------- /src/numerblox/preprocessing/classic.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from numerblox.feature_groups import V5_FEATURE_GROUP_MAPPING 8 | from numerblox.preprocessing.base import BasePreProcessor 9 | 10 | 11 | class GroupStatsPreProcessor(BasePreProcessor): 12 | """ 13 | Note that this class only works with `pd.DataFrame` input. 14 | When using in a Pipeline, make sure that the Pandas output API is set (`.set_output(transform="pandas")`. 15 | 16 | Calculates group statistics for all data groups. \n 17 | :param groups: Groups to create features for. All groups by default. 
\n 18 | """ 19 | 20 | def __init__(self, groups: list = None): 21 | super().__init__() 22 | self.all_groups = ["intelligence", "charisma", "strength", "dexterity", "constitution", "wisdom", "agility", "serenity", "sunshine", "rain"] 23 | self.groups = groups 24 | self.group_names = groups if self.groups else self.all_groups 25 | self.feature_group_mapping = V5_FEATURE_GROUP_MAPPING 26 | 27 | def transform(self, X: pd.DataFrame) -> np.array: 28 | """Check validity and add group features.""" 29 | dataf = self._add_group_features(X) 30 | return dataf.to_numpy() 31 | 32 | def _add_group_features(self, X: pd.DataFrame) -> pd.DataFrame: 33 | """Mean, standard deviation and skew for each group.""" 34 | dataf = pd.DataFrame() 35 | for group in self.group_names: 36 | cols = self.feature_group_mapping[group] 37 | valid_cols = [col for col in cols if col in X.columns] 38 | if not valid_cols: 39 | warnings.warn(f"None of the columns of '{group}' are in the input data. Output will be nans for the group features.") 40 | elif len(cols) != len(valid_cols): 41 | warnings.warn(f"Not all columns of '{group}' are in the input data ({len(valid_cols)} < {len(cols)}). Use remaining columns for stats features.") 42 | dataf.loc[:, f"feature_{group}_mean"] = X[valid_cols].mean(axis=1) 43 | dataf.loc[:, f"feature_{group}_std"] = X[valid_cols].std(axis=1) 44 | dataf.loc[:, f"feature_{group}_skew"] = X[valid_cols].skew(axis=1) 45 | return dataf 46 | 47 | def get_feature_names_out(self, input_features=None) -> List[str]: 48 | """Return feature names.""" 49 | if not input_features: 50 | feature_names = [] 51 | for group in self.group_names: 52 | feature_names.append(f"feature_{group}_mean") 53 | feature_names.append(f"feature_{group}_std") 54 | feature_names.append(f"feature_{group}_skew") 55 | else: 56 | feature_names = input_features 57 | return feature_names 58 | -------------------------------------------------------------------------------- /src/numerblox/targets.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import List, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from scipy.stats import rankdata 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.linear_model import Ridge 10 | from sklearn.mixture import BayesianGaussianMixture 11 | from sklearn.utils.validation import check_is_fitted 12 | from tqdm import tqdm 13 | 14 | # Ignore SettingWithCopyWarning 15 | pd.options.mode.chained_assignment = None 16 | 17 | 18 | class BaseTargetProcessor(TransformerMixin, BaseEstimator): 19 | """Common functionality for preprocessors and postprocessors.""" 20 | 21 | def __init__(self): 22 | sklearn.set_config(enable_metadata_routing=True) 23 | self.set_transform_request(era_series=True) 24 | 25 | def fit(self, X, y=None): 26 | self.is_fitted_ = True 27 | return self 28 | 29 | @abstractmethod 30 | def transform(self, X: Union[np.array, pd.DataFrame], y=None) -> pd.DataFrame: ... 31 | 32 | @abstractmethod 33 | def get_feature_names_out(self, input_features=None) -> List[str]: ... 34 | 35 | 36 | class BayesianGMMTargetProcessor(BaseTargetProcessor): 37 | """ 38 | Generate synthetic (fake) target using a Bayesian Gaussian Mixture model. \n 39 | Based on Michael Oliver's GitHub Gist implementation: \n 40 | https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93 41 | 42 | :param n_components: Number of components for fitting Bayesian Gaussian Mixture Model. 
43 | """ 44 | 45 | def __init__( 46 | self, 47 | n_components: int = 3, 48 | ): 49 | super().__init__() 50 | self.set_fit_request(era_series=True) 51 | self.n_components = n_components 52 | self.ridge = Ridge(fit_intercept=False) 53 | self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1] 54 | 55 | def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series): 56 | """ 57 | Fit Bayesian Gaussian Mixture model on coefficients and normalize. 58 | :param X: DataFrame containing features. 59 | :param y: Series containing real target. 60 | :param era_series: Series containing era information. 61 | """ 62 | bgmm = BayesianGaussianMixture(n_components=self.n_components) 63 | coefs = self._get_coefs(dataf=X, y=y, era_series=era_series) 64 | bgmm.fit(coefs) 65 | # make probability of sampling each component equal to better balance rare regimes 66 | bgmm.weights_[:] = 1 / self.n_components 67 | self.bgmm_ = bgmm 68 | self.is_fitted_ = True 69 | return self 70 | 71 | def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array: 72 | """ 73 | Main method for generating fake target. 74 | :param X: DataFrame containing features. 75 | :param era_series: Series containing era information. 76 | """ 77 | check_is_fitted(self, "bgmm_") 78 | assert len(X) == len(era_series), "X and eras must be same length." 79 | all_eras = era_series.unique().tolist() 80 | # Scale data between 0 and 1 81 | X = X.astype(float) 82 | X /= X.max() 83 | X -= 0.5 84 | X.loc[:, "era"] = era_series 85 | 86 | fake_target = self._generate_target(dataf=X, all_eras=all_eras) 87 | return fake_target 88 | 89 | def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, era_series: pd.Series) -> np.ndarray: 90 | """ 91 | Generate coefficients for BGMM. 92 | :param dataf: DataFrame containing features. 93 | :param y: Series containing real target. 94 | """ 95 | coefs = [] 96 | dataf.loc[:, "era"] = era_series 97 | dataf.loc[:, "target"] = y 98 | all_eras = dataf["era"].unique().tolist() 99 | for era in all_eras: 100 | era_df = dataf[dataf["era"] == era] 101 | era_y = era_df.loc[:, "target"] 102 | era_df = era_df.drop(columns=["era", "target"]) 103 | self.ridge.fit(era_df, era_y) 104 | coefs.append(self.ridge.coef_) 105 | stacked_coefs = np.vstack(coefs) 106 | return stacked_coefs 107 | 108 | def _generate_target(self, dataf: pd.DataFrame, all_eras: list) -> np.ndarray: 109 | """Generate fake target using Bayesian Gaussian Mixture model.""" 110 | fake_target = [] 111 | for era in tqdm(all_eras, desc="Generating fake target"): 112 | features = dataf[dataf["era"] == era] 113 | features = features.drop(columns=["era", "target"]) 114 | # Sample a set of weights from GMM 115 | beta, _ = self.bgmm_.sample(1) 116 | # Create fake continuous target 117 | fake_targ = features @ beta[0] 118 | # Bin fake target like real target 119 | fake_targ = (rankdata(fake_targ) - 0.5) / len(fake_targ) 120 | fake_targ = (np.digitize(fake_targ, self.bins) - 1) / 4 121 | fake_target.append(fake_targ) 122 | return np.concatenate(fake_target) 123 | 124 | def get_feature_names_out(self, input_features=None) -> List[str]: 125 | """Return feature names.""" 126 | return ["fake_target"] if not input_features else input_features 127 | 128 | 129 | class SignalsTargetProcessor(BaseTargetProcessor): 130 | """ 131 | Engineer targets for Numerai Signals. \n 132 | More information on implements Numerai Signals targets: \n 133 | https://forum.numer.ai/t/decoding-the-signals-target/2501 134 | 135 | :param price_col: Column from which target will be derived. 
\n 136 | :param windows: Timeframes to use for engineering targets. 10 and 20-day by default. \n 137 | :param bins: Binning used to create group targets. Nomi binning by default. \n 138 | :param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default. 139 | """ 140 | 141 | def __init__( 142 | self, 143 | price_col: str = "close", 144 | windows: list = None, 145 | bins: list = None, 146 | labels: list = None, 147 | ): 148 | super().__init__() 149 | self.price_col = price_col 150 | self.windows = windows if windows else [10, 20] 151 | self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1] 152 | self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1] 153 | 154 | def transform(self, dataf: pd.DataFrame, era_series: pd.Series) -> np.array: 155 | for window in tqdm(self.windows, desc="Signals target engineering windows"): 156 | dataf.loc[:, f"target_{window}d_raw"] = dataf[self.price_col].pct_change(periods=window).shift(-window) 157 | era_groups = dataf.groupby(era_series) 158 | 159 | dataf.loc[:, f"target_{window}d_rank"] = era_groups[f"target_{window}d_raw"].rank(pct=True, method="first") 160 | dataf.loc[:, f"target_{window}d_group"] = era_groups[f"target_{window}d_rank"].transform(lambda group: pd.cut(group, bins=self.bins, labels=self.labels, include_lowest=True)) 161 | output_cols = self.get_feature_names_out() 162 | return dataf[output_cols].to_numpy() 163 | 164 | def get_feature_names_out(self, input_features=None) -> List[str]: 165 | """Return feature names of Signals targets.""" 166 | if not input_features: 167 | feature_names = [] 168 | for window in self.windows: 169 | feature_names.append(f"target_{window}d_raw") 170 | feature_names.append(f"target_{window}d_rank") 171 | feature_names.append(f"target_{window}d_group") 172 | else: 173 | feature_names = input_features 174 | return feature_names 175 | -------------------------------------------------------------------------------- /tests/test_assets/mock_credentials.json: -------------------------------------------------------------------------------- 1 | {"pub_id": "Hello", "secret_key": "World"} -------------------------------------------------------------------------------- /tests/test_assets/val_3_eras.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/tests/test_assets/val_3_eras.parquet -------------------------------------------------------------------------------- /tests/test_download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowdcent/numerblox/16834cbeca383613f9944ea7bc78e9e7b8ce4034/tests/test_download/__init__.py -------------------------------------------------------------------------------- /tests/test_download/test_download_classic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import PosixPath 3 | from uuid import uuid4 4 | 5 | import pytest 6 | from numerapi import NumerAPI 7 | 8 | from numerblox.download import NumeraiClassicDownloader 9 | 10 | ALL_CLASSIC_VERSIONS = set(s.split("/")[0] for s in NumerAPI().list_datasets() if "signals" not in s) 11 | 12 | TEST_CLASSIC_DIR = f"test_numclassic_general_{uuid4()}" 13 | TEST_CLASSIC_VERSIONS = ["5.0"] 14 | 15 | 16 | def test_base(): 17 | numer_classic_downloader = NumeraiClassicDownloader(TEST_CLASSIC_DIR) 18 | 19 | # Test 
building class 20 | assert isinstance(numer_classic_downloader.dir, PosixPath) 21 | assert numer_classic_downloader.dir.is_dir() 22 | 23 | # Test is_empty 24 | (numer_classic_downloader.dir / "test.txt").write_text("test") 25 | assert not numer_classic_downloader.is_empty 26 | 27 | # Remove contents 28 | numer_classic_downloader.remove_base_directory() 29 | assert not os.path.exists(TEST_CLASSIC_DIR) 30 | 31 | 32 | def test_classic(): 33 | dl = NumeraiClassicDownloader(TEST_CLASSIC_DIR) 34 | 35 | # Check versions 36 | assert dl.dataset_versions == ALL_CLASSIC_VERSIONS 37 | 38 | # Test live download 39 | for version in TEST_CLASSIC_VERSIONS: 40 | dl.download_live_data("live", version=version) 41 | assert os.path.exists(dl.dir / "live") 42 | assert os.path.exists(dl.dir / "live" / "live.parquet") 43 | 44 | # Test example data 45 | dl.download_example_data("test/", version=version) 46 | assert os.path.exists(dl.dir / "test") 47 | assert os.path.exists(dl.dir / "test" / "live_example_preds.parquet") 48 | assert os.path.exists(dl.dir / "test" / "validation_example_preds.parquet") 49 | 50 | # Test features 51 | features = dl.get_classic_features() 52 | assert isinstance(features, dict) 53 | assert len(features["feature_sets"]["medium"]) == 705 54 | # Check that feature_stats and feature_sets keys exist 55 | assert "feature_sets" in features.keys() 56 | 57 | dl.remove_base_directory() 58 | 59 | 60 | def test_classic_versions(): 61 | downloader = NumeraiClassicDownloader(directory_path=f"some_path_{uuid4()}") 62 | 63 | # Test unsupported versions 64 | unsupported_versions = ["3"] 65 | for version in unsupported_versions: 66 | with pytest.raises(AssertionError): 67 | downloader.download_training_data(version=version) 68 | with pytest.raises(AssertionError): 69 | downloader.download_live_data(version=version) 70 | 71 | downloader.remove_base_directory() 72 | -------------------------------------------------------------------------------- /tests/test_download/test_download_crypto.py: -------------------------------------------------------------------------------- 1 | import os 2 | from uuid import uuid4 3 | 4 | import pytest 5 | 6 | from numerblox.download import NumeraiCryptoDownloader 7 | 8 | ALL_CRYPTO_VERSIONS = ["v1.0"] 9 | 10 | 11 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 12 | def test_crypto(): 13 | TEST_CRYPTO_DIR = f"test_numcrypto_general_{uuid4()}" 14 | dl = NumeraiCryptoDownloader(TEST_CRYPTO_DIR) 15 | 16 | # Check versions 17 | assert dl.dataset_versions == ALL_CRYPTO_VERSIONS 18 | 19 | # Test live download 20 | dl.download_live_data("live", version="1.0") 21 | assert os.path.exists(dl.dir / "live") 22 | assert os.path.exists(dl.dir / "live" / "live_universe.parquet") 23 | 24 | # Test training data download 25 | dl.download_training_data("train/", version="1.0") 26 | assert os.path.exists(dl.dir / "train") 27 | assert os.path.exists(dl.dir / "train" / "train_targets.parquet") 28 | 29 | 30 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 31 | def test_crypto_versions(): 32 | downloader = NumeraiCryptoDownloader(directory_path=f"some_path_{uuid4()}") 33 | 34 | # Test unsupported versions 35 | unsupported_versions = ["0", "0.5", "3.5"] 36 | for version in unsupported_versions: 37 | with pytest.raises(AssertionError): 38 | downloader.download_training_data(version=version) 39 | with pytest.raises(AssertionError): 40 | downloader.download_live_data(version=version) 41 | -------------------------------------------------------------------------------- 
/tests/test_download/test_download_signals.py: -------------------------------------------------------------------------------- 1 | import os 2 | from uuid import uuid4 3 | 4 | import pytest 5 | from numerapi import SignalsAPI 6 | 7 | from numerblox.download import EODDownloader, KaggleDownloader, NumeraiSignalsDownloader 8 | 9 | ALL_SIGNALS_VERSIONS = set(s.replace("signals/", "").split("/")[0] for s in SignalsAPI().list_datasets() if s.startswith("signals/v")) 10 | TEST_SIGNALS_DIR = f"test_numsignals_general_{uuid4()}" 11 | TEST_SIGNALS_VERSIONS = ["2.0"] 12 | 13 | 14 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 15 | def test_signals(): 16 | dl = NumeraiSignalsDownloader(TEST_SIGNALS_DIR) 17 | 18 | # Check versions 19 | assert dl.dataset_versions == ALL_SIGNALS_VERSIONS 20 | 21 | # Test live download 22 | for version in TEST_SIGNALS_VERSIONS: 23 | dl.download_live_data("live", version=version) 24 | assert os.path.exists(dl.dir / "live") 25 | assert os.path.exists(dl.dir / "live" / "live.parquet") 26 | 27 | # Test example data 28 | dl.download_example_data("test/", version=version) 29 | assert os.path.exists(dl.dir / "test") 30 | assert os.path.exists(dl.dir / "test" / "live_example_preds.parquet") 31 | assert os.path.exists(dl.dir / "test" / "validation_example_preds.parquet") 32 | 33 | dl.remove_base_directory() 34 | 35 | 36 | @pytest.mark.xfail(reason="May fail due to API rate limiting") 37 | def test_signals_versions(): 38 | downloader = NumeraiSignalsDownloader(directory_path=f"some_path_{uuid4()}") 39 | 40 | # Test unsupported versions 41 | unsupported_versions = ["0"] 42 | for version in unsupported_versions: 43 | with pytest.raises(AssertionError): 44 | downloader.download_training_data(version=version) 45 | with pytest.raises(AssertionError): 46 | downloader.download_live_data(version=version) 47 | 48 | downloader.remove_base_directory() 49 | 50 | 51 | @pytest.mark.xfail(reason="May fail due to API rate limiting or missing credentials") 52 | def test_kaggle_downloader(): 53 | try: 54 | kd = KaggleDownloader(f"test_kaggle_{uuid4()}") 55 | assert os.path.exists(kd.dir) 56 | kd.remove_base_directory() 57 | except OSError: 58 | pass 59 | 60 | 61 | @pytest.mark.xfail(reason="May fail due to API rate limiting or missing credentials") 62 | def test_eod(): 63 | eod = EODDownloader(f"test_eod_{uuid4()}", key="DEMO", tickers=["AAPL.US"]) 64 | eod.download_live_data() 65 | eod.download_training_data() 66 | eod.remove_base_directory() 67 | -------------------------------------------------------------------------------- /tests/test_end_to_end.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.compose import ColumnTransformer 3 | from sklearn.model_selection import TimeSeriesSplit 4 | from sklearn.pipeline import make_pipeline, make_union 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklego.preprocessing import ColumnSelector 7 | from xgboost import XGBRegressor 8 | 9 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 10 | from numerblox.meta import CrossValEstimator, MetaEstimator, make_meta_pipeline 11 | from numerblox.neutralizers import FeatureNeutralizer 12 | from numerblox.numerframe import create_numerframe 13 | from numerblox.preprocessing import GroupStatsPreProcessor 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def setup_data(): 18 | df = create_numerframe("tests/test_assets/val_3_eras.parquet") 19 | return df 20 | 21 | 22 | def 
test_neutralized_xgboost_pipeline(setup_data): 23 | df = setup_data 24 | 25 | X, y = df.get_feature_target_pair(multi_target=False) 26 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 27 | era_series = df.get_era_data 28 | features = df.get_feature_data 29 | 30 | # Preprocessing 31 | gpp = GroupStatsPreProcessor(groups=["sunshine", "rain"]) 32 | fncv3_selector = ColumnSelector(fncv3_cols) 33 | # TODO Test with preproc FeatureUnion 34 | preproc_pipe = ColumnTransformer([("gpp", gpp, features.columns.tolist()), ("selector", fncv3_selector, fncv3_cols)]) 35 | 36 | # Model 37 | xgb = XGBRegressor() 38 | cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5)) 39 | ens = NumeraiEnsemble() 40 | fn = FeatureNeutralizer(proportion=0.5) 41 | full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn) 42 | 43 | # Train full model 44 | full_pipe.fit(X, y, era_series=era_series) 45 | # Inference 46 | preds = full_pipe.predict(X, era_series=era_series, features=features) 47 | assert preds.min() >= 0 48 | assert abs(preds.max() - 1) <= 1e-9 49 | assert preds.shape[0] == X.shape[0] 50 | assert len(preds.shape) == 2 51 | 52 | 53 | def test_multi_classification_ensemble(setup_data): 54 | df = setup_data 55 | X, y = df.get_feature_target_pair(multi_target=False) 56 | era_series = df.get_era_data 57 | features = df.get_feature_data 58 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 59 | # TODO Test with preproc FeatureUnion in sklearn 1.5+ 60 | preproc_pipe = ColumnTransformer([("gpp", GroupStatsPreProcessor(groups=["sunshine", "rain"]), features.columns.tolist()), ("selector", ColumnSelector(fncv3_cols), fncv3_cols)]) 61 | 62 | model = DecisionTreeClassifier() 63 | crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func="predict_proba") 64 | pred_rud = PredictionReducer(n_models=3, n_classes=5) 65 | ens = NumeraiEnsemble(donate_weighted=True) 66 | fn = FeatureNeutralizer(proportion=0.5) 67 | full_pipe = make_meta_pipeline(preproc_pipe, crossval, pred_rud, ens, fn) 68 | 69 | y_int = (y * 4).astype(int) 70 | full_pipe.fit(X, y_int, era_series=era_series) 71 | 72 | preds = full_pipe.predict(X, era_series=era_series, features=features) 73 | assert preds.min() >= 0 74 | assert abs(preds.max() - 1) <= 1e-9 75 | assert preds.shape[0] == X.shape[0] 76 | assert len(preds.shape) == 2 77 | 78 | 79 | @pytest.mark.xfail(reason="Can only be tested with sklearn 1.5+") 80 | def test_feature_union_pipeline(setup_data): 81 | df = setup_data 82 | X, y = df.get_feature_target_pair(multi_target=False) 83 | era_series = df.get_era_data 84 | features = df.get_feature_data 85 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 86 | 87 | gpp = GroupStatsPreProcessor(groups=["sunshine", "rain"]) 88 | fncv3_selector = ColumnSelector(fncv3_cols) 89 | preproc_pipe = make_union(gpp, fncv3_selector) 90 | 91 | xgb = MetaEstimator(XGBRegressor()) 92 | fn = FeatureNeutralizer(proportion=0.5) 93 | model_pipe = make_pipeline(preproc_pipe, xgb, fn) 94 | 95 | model_pipe.fit(X, y) 96 | 97 | preds = model_pipe.predict(X, era_series=era_series, features=features) 98 | assert preds.min() >= 0 99 | assert abs(preds.max() - 1) <= 1e-9 100 | assert preds.shape[0] == X.shape[0] 101 | 102 | 103 | def test_column_transformer_pipeline(setup_data): 104 | df = setup_data 105 | X, y = df.get_feature_target_pair(multi_target=False) 106 | 107 | era_series = df.get_era_data 108 | features = df.get_feature_data 109 | fncv3_cols = df.get_fncv3_feature_data.columns.tolist() 110 | 111 | gpp = 
GroupStatsPreProcessor(groups=["sunshine", "rain"]) 112 | preproc_pipe = ColumnTransformer([("gpp", gpp, features.columns.tolist()), ("selector", "passthrough", fncv3_cols[2:])]) 113 | xgb = MetaEstimator(XGBRegressor()) 114 | fn = FeatureNeutralizer(proportion=0.5) 115 | model_pipe = make_pipeline(preproc_pipe, xgb, fn) 116 | 117 | model_pipe.fit(X, y) 118 | 119 | preds = model_pipe.predict(X, era_series=era_series, features=features) 120 | assert preds.min() >= 0 121 | assert abs(preds.max() - 1) <= 1e-9 122 | assert preds.shape[0] == X.shape[0] 123 | -------------------------------------------------------------------------------- /tests/test_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from scipy.stats import rankdata 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.datasets import make_regression 7 | from sklearn.utils.validation import check_is_fitted 8 | 9 | from numerblox.ensemble import NumeraiEnsemble, PredictionReducer 10 | 11 | 12 | ##### Mock objects ##### 13 | @pytest.fixture 14 | def sample_data(): 15 | return make_regression(n_samples=100, n_features=20, noise=0.1) 16 | 17 | 18 | @pytest.fixture 19 | def ensemble(): 20 | return NumeraiEnsemble() 21 | 22 | 23 | ##### NumeraiEnsemble ##### 24 | 25 | 26 | def test_numeraiensemble_fit(ensemble, sample_data): 27 | X, y = sample_data 28 | ensemble.fit(X, y) 29 | check_is_fitted(ensemble) 30 | assert issubclass(type(ensemble), (TransformerMixin, BaseEstimator)) 31 | 32 | 33 | def test_numeraiensemble_predict(ensemble, sample_data): 34 | X, y = sample_data 35 | ensemble = NumeraiEnsemble(weights=[0.05, 0.05, 0.3, 0.3, 0.3]) 36 | ensemble.fit(X, y) 37 | eras = np.array([1] * 50 + [2] * 50) 38 | input_preds = np.random.uniform(size=(100, 5)) 39 | 40 | ensemble_preds = ensemble.predict(input_preds, era_series=eras) 41 | # The length of output should have the same shape as input preds 42 | assert len(ensemble_preds) == len(input_preds) 43 | # Output should be a numpy array with values between 0 and 1 44 | assert isinstance(ensemble_preds, np.ndarray) 45 | assert len(ensemble_preds.shape) == 2 46 | assert ensemble_preds.min() >= 0 47 | assert ensemble_preds.max() <= 1 48 | 49 | # Test with Pandas Series into 50 | input_preds = pd.DataFrame(input_preds) 51 | eras = pd.Series(eras) 52 | ensemble_preds = ensemble.predict(input_preds, eras) 53 | 54 | 55 | def test_numeraiensemble_standardize(ensemble, sample_data): 56 | X, y = sample_data 57 | ensemble.fit(X, y) 58 | 59 | data = np.array([1, 2, 3, 4, 5]) 60 | standardized_data = ensemble._standardize(data) 61 | 62 | expected = (rankdata(data, method="ordinal") - 0.5) / len(data) 63 | 64 | assert np.allclose(standardized_data, expected) 65 | 66 | 67 | def test_numeraiensemble_standardize_by_era(ensemble): 68 | eras = np.array([1, 1, 1, 2, 2, 2]) 69 | 70 | # Test 1: Basic functionality 71 | X = np.array([0.5, 0.7, 0.1, 0.9, 0.6, 0.3]) 72 | standardized = ensemble._standardize_by_era(X, eras) 73 | # These values are simply computed based on manual calculations for rank and normalization 74 | expected_values_1 = [0.5, 0.83333333, 0.16666667, 0.83333333, 0.5, 0.16666667] 75 | assert np.allclose(standardized, expected_values_1) 76 | 77 | # Test 2: Check standardized values for all same predictions split across two different eras 78 | X = np.array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5]) 79 | standardized = ensemble._standardize_by_era(X, eras) 80 | 
expected_values_2 = [0.16666667, 0.5, 0.83333333, 0.16666667, 0.5, 0.83333333] 81 | assert np.allclose(standardized, expected_values_2) 82 | 83 | # Test 3: Different predictions but split across two eras 84 | X = np.array([0.1, 0.9, 0.9, 0.1, 0.1, 0.9]) 85 | standardized = ensemble._standardize_by_era(X, eras) 86 | expected_values_3 = [0.16666667, 0.5, 0.83333333, 0.16666667, 0.5, 0.83333333] 87 | assert np.allclose(standardized, expected_values_3) 88 | 89 | 90 | def test_numeraiensemble_predict_with_constant_values(ensemble): 91 | # Create an instance of your ensemble with mock estimators 92 | era_series = np.random.randint(1, 5, size=100) 93 | 94 | X_fit = np.random.rand(100, 3) 95 | y_fit = np.random.rand(100) 96 | ensemble.fit(X_fit, y_fit) 97 | 98 | constant_preds = np.ones((100, 5)) 99 | 100 | with pytest.raises(ValueError, match="Predictions for all columns are constant. No valid predictions to ensemble."): 101 | with pytest.warns(UserWarning, match="Some estimator predictions are constant. Consider checking your estimators. Skipping these estimator predictions in ensembling."): 102 | ensemble.predict(constant_preds, era_series) 103 | 104 | 105 | def test_numeraiensemble_predict_with_nans(ensemble): 106 | # Create an instance of your ensemble with mock estimators 107 | era_series = np.random.randint(1, 5, size=100) 108 | 109 | X_fit = np.random.rand(100, 3) 110 | y_fit = np.random.rand(100) 111 | ensemble.fit(X_fit, y_fit) 112 | 113 | nan_preds = np.ones((100, 5)) 114 | nan_preds[5:15, 0] = np.nan 115 | nan_preds[:5, 1] = np.nan 116 | 117 | with pytest.warns(UserWarning, match="Predictions in column"): 118 | ensemble_preds = ensemble.predict(nan_preds, era_series) 119 | assert len(ensemble_preds) == len(nan_preds) 120 | # Output should be a numpy array with values between 0 and 1 121 | assert isinstance(ensemble_preds, np.ndarray) 122 | # There must be some nans in the data. 
123 | assert np.sum(np.isnan(ensemble_preds)) >= 0 124 | # None nan values should be between 0 and 1 125 | non_nan_values = ensemble_preds[~np.isnan(ensemble_preds)] 126 | if non_nan_values.size > 0: 127 | assert non_nan_values.min() >= 0 128 | assert non_nan_values.max() <= 1 129 | 130 | 131 | def test_numeraiensemble_donate_weights(ensemble): 132 | ensemble.donate_weighted = True 133 | # For 3 predictions, weights should be [0.25, 0.25, 0.5] 134 | assert ensemble._get_donate_weights(n=3) == [0.25, 0.25, 0.5] 135 | # For 5, weights should be [0.0625, 0.0625, 0.125, 0.25, 0.5] 136 | assert ensemble._get_donate_weights(n=5) == [0.0625, 0.0625, 0.125, 0.25, 0.5] 137 | 138 | 139 | def test_numeraiensemble_donate_weights_sum_to_one(ensemble): 140 | ensemble.donate_weighted = True 141 | for n_estimators in range(1, 11): 142 | # Assert that the sum of weights is close to 1 143 | assert np.isclose(sum(ensemble._get_donate_weights(n=n_estimators)), 1.0) 144 | 145 | 146 | def test_numeraiensemble_get_feature_names_out(ensemble): 147 | X = np.random.rand(10, 3) 148 | y = np.random.rand(10) 149 | ensemble.fit(X, y) 150 | assert ensemble.get_feature_names_out() == ["numerai_ensemble_predictions"] 151 | assert ensemble.get_feature_names_out(["a", "b"]) == ["a", "b"] 152 | 153 | 154 | def test_numeraiensemble_set_output(ensemble, sample_data): 155 | X, y = sample_data 156 | era_series = np.array([1] * 50 + [2] * 50) 157 | ens_ins = ensemble 158 | ens_ins.fit(X, y) 159 | 160 | ens_ins.set_output(transform="pandas") 161 | preds = ens_ins.predict(X, era_series=era_series) 162 | assert isinstance(preds, pd.DataFrame) 163 | ens_ins.set_output(transform="default") 164 | preds = ens_ins.predict(X, era_series=era_series) 165 | assert isinstance(preds, np.ndarray) 166 | 167 | 168 | ##### PredictionReducer ##### 169 | 170 | 171 | def test_prediction_reducer(): 172 | # Simulated probability predictions for 3 samples, 2 models and 3 classes 173 | X = np.array([[0.1, 0.7, 0.2, 0.2, 0.5, 0.3], [0.2, 0.5, 0.3, 0.3, 0.3, 0.4], [0.6, 0.2, 0.2, 0.4, 0.4, 0.2]]) 174 | 175 | reducer = PredictionReducer(n_models=2, n_classes=3) 176 | reduced_X = reducer.fit_transform(X) 177 | 178 | # The expected result is a 3x2 matrix 179 | expected_result = np.array([[0.7 * 1 + 0.2 * 2, 0.5 * 1 + 0.3 * 2], [0.5 * 1 + 0.3 * 2, 0.3 * 1 + 0.4 * 2], [0.2 * 1 + 0.2 * 2, 0.4 * 1 + 0.2 * 2]]) 180 | 181 | assert reduced_X.shape == (3, 2) 182 | np.testing.assert_array_almost_equal(reduced_X, expected_result) 183 | 184 | assert issubclass(type(reducer), (BaseEstimator, TransformerMixin)) 185 | 186 | # Set output API 187 | reducer.set_output(transform="pandas") 188 | preds = reducer.predict(X) 189 | assert isinstance(preds, pd.DataFrame) 190 | reducer.set_output(transform="default") 191 | preds = reducer.predict(X) 192 | assert isinstance(preds, np.ndarray) 193 | 194 | 195 | def test_prediction_reducer_feature_names_out(): 196 | reducer = PredictionReducer(n_models=3, n_classes=4) 197 | feature_names = reducer.get_feature_names_out() 198 | expected_names = ["reduced_prediction_0", "reduced_prediction_1", "reduced_prediction_2"] 199 | 200 | assert feature_names == expected_names 201 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from numerblox.misc import AttrDict, Key, load_key_from_json 2 | 3 | 4 | def test_attrdict(): 5 | test_dict = AttrDict({"test1": "hello", "test2": "world"}) 6 | assert 
hasattr(test_dict, "test1") 7 | assert hasattr(test_dict, "test2") 8 | assert test_dict.test1 == test_dict["test1"] 9 | assert test_dict.test2 == test_dict["test2"] 10 | 11 | 12 | def test_key(): 13 | pub_id, secret_key = "Hello", "World" 14 | example_key = Key(pub_id=pub_id, secret_key=secret_key) 15 | assert str(example_key) == example_key.__repr__() 16 | assert (example_key.pub_id, example_key.secret_key) == (pub_id, secret_key) 17 | 18 | 19 | def test_load_key_from_json(): 20 | example_key = load_key_from_json("tests/test_assets/mock_credentials.json") 21 | assert (example_key.pub_id, example_key.secret_key) == ("Hello", "World") 22 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import create_classic_sample_data 3 | 4 | from numerblox.models import EraBoostedXGBRegressor 5 | 6 | setup_data = create_classic_sample_data 7 | 8 | 9 | def test_initialization(): 10 | model = EraBoostedXGBRegressor() 11 | assert model.proportion == 0.5 12 | assert model.trees_per_step == 10 13 | assert model.num_iters == 200 14 | assert model.n_estimators == 100 15 | 16 | custom_model = EraBoostedXGBRegressor(proportion=0.3, trees_per_step=5, num_iters=10) 17 | assert custom_model.proportion == 0.3 18 | assert custom_model.trees_per_step == 5 19 | assert custom_model.num_iters == 10 20 | 21 | 22 | def test_fit_method(setup_data): 23 | model = EraBoostedXGBRegressor(proportion=0.5, num_iters=5, n_estimators=100, max_depth=3, learning_rate=0.1) 24 | X, y, era_series = setup_data[["feature1", "feature2"]], setup_data["target"], setup_data["era"] 25 | initial_tree_count = model.n_estimators 26 | 27 | model.fit(X, y, era_series=era_series, verbose=500) 28 | 29 | assert model.n_estimators > initial_tree_count 30 | # Check if the final number of trees is as expected 31 | expected_final_tree_count = initial_tree_count + (model.num_iters - 1) * model.trees_per_step 32 | assert model.n_estimators == expected_final_tree_count 33 | 34 | 35 | def test_predictions(setup_data): 36 | model = EraBoostedXGBRegressor(num_iters=5, proportion=0.5, n_estimators=100, learning_rate=0.1, max_depth=3) 37 | X, y, era_series = setup_data[["feature1", "feature2"]], setup_data["target"], setup_data["era"] 38 | model.fit(X, y, era_series=era_series) 39 | 40 | predictions = model.predict(X) 41 | assert len(predictions) == len(X) 42 | # Check that predictions are not constant. 43 | assert len(set(predictions)) > 1 44 | # Check that it has fitted the data reasonably well. 
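# (np.corrcoef returns the Pearson correlation matrix; the off-diagonal entry taken below is the in-sample correlation between predictions and the target, and 0.8 is a loose sanity threshold rather than a performance guarantee.)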
45 | correlation = np.corrcoef(predictions, y)[0, 1] 46 | assert correlation > 0.8 47 | -------------------------------------------------------------------------------- /tests/test_neutralizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | import sklearn 5 | from sklearn.pipeline import make_pipeline 6 | from sklearn.utils._metadata_requests import MetadataRequest 7 | from utils import create_classic_sample_data 8 | 9 | from numerblox.neutralizers import BaseNeutralizer, FeatureNeutralizer 10 | 11 | sklearn.set_config(enable_metadata_routing=True) 12 | 13 | setup_data = create_classic_sample_data 14 | 15 | 16 | def test_base_neutralizer_initialization(): 17 | bn = BaseNeutralizer(new_col_names=["test"]) 18 | assert bn.new_col_names == ["test"] 19 | 20 | 21 | def test_base_neutralizer_fit(setup_data): 22 | obj = BaseNeutralizer(new_col_names=["test"]).fit(setup_data) 23 | assert isinstance(obj, BaseNeutralizer) 24 | 25 | 26 | def test_feature_neutralizer_initialization(): 27 | fn = FeatureNeutralizer() 28 | assert fn.new_col_names[0].startswith("prediction_neutralized_") 29 | 30 | # Proportion must be between 0 and 1 31 | with pytest.raises(AssertionError): 32 | FeatureNeutralizer(proportion=[1.1]) 33 | with pytest.raises(AssertionError): 34 | FeatureNeutralizer(proportion=[-0.1]) 35 | 36 | # Test routing 37 | routing = fn.get_metadata_routing() 38 | assert isinstance(routing, MetadataRequest) 39 | assert routing.consumes("transform", ["features", "era_series"]) == set({"features", "era_series"}) 40 | assert routing.consumes("predict", ["features", "era_series"]) == set({"features", "era_series"}) 41 | 42 | 43 | def test_feature_neutralizer_length_mismatch_X_features(setup_data): 44 | fn = FeatureNeutralizer() 45 | features = setup_data[["feature1", "feature2"]] 46 | era_series = setup_data["era"] 47 | X = setup_data["prediction"][:-1] # Remove one element to cause mismatch 48 | 49 | with pytest.raises(AssertionError): 50 | fn.transform(X, features=features, era_series=era_series) 51 | 52 | 53 | def test_feature_neutralizer_length_mismatch_X_eras(setup_data): 54 | fn = FeatureNeutralizer() 55 | features = setup_data[["feature1", "feature2"]] 56 | era_series = setup_data["era"][:-1] # Remove one element to cause mismatch 57 | X = setup_data["prediction"] 58 | 59 | with pytest.raises(AssertionError): 60 | fn.transform(X, features=features, era_series=era_series) 61 | 62 | 63 | def test_feature_neutralizer_incorrect_dim_X_single_pred(setup_data): 64 | fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"]) 65 | features = setup_data[["feature1", "feature2"]] 66 | era_series = setup_data["era"] 67 | X = setup_data["prediction"] # X is 1D, but two prediction names are provided 68 | 69 | with pytest.raises(AssertionError): 70 | fn.transform(X, features=features, era_series=era_series) 71 | 72 | 73 | def test_feature_neutralizer_incorrect_dim_X_multi_pred(setup_data): 74 | fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"]) 75 | features = setup_data[["feature1", "feature2"]] 76 | era_series = setup_data["era"] 77 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 78 | X = setup_data[["prediction"]] # Only one column provided, but two expected 79 | 80 | with pytest.raises(AssertionError): 81 | fn.transform(X, features=features, era_series=era_series) 82 | 83 | 84 | def test_feature_neutralizer_predict(setup_data): 85 | fn = 
FeatureNeutralizer(pred_name="prediction", proportion=0.5) 86 | features = setup_data[["feature1", "feature2"]] 87 | era_series = setup_data["era"] 88 | X = setup_data["prediction"] 89 | result = fn.transform(X, features=features, era_series=era_series) 90 | assert len(result) == len(setup_data) 91 | assert result.shape[1] == 1 92 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 93 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 94 | 95 | 96 | def test_feature_neutralizer_transform_no_era(setup_data): 97 | fn = FeatureNeutralizer(pred_name="prediction", proportion=0.5) 98 | features = setup_data[["feature1", "feature2"]] 99 | X = setup_data["prediction"] 100 | # Ensure warning is raised. Omitting era_series with .set_transform_request(era_series=True) does not raise an error. 101 | with pytest.warns(UserWarning): 102 | result = make_pipeline(fn).transform(X, features=features) 103 | assert len(result) == len(setup_data) 104 | assert result.shape[1] == 1 105 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 106 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 107 | 108 | fn.set_transform_request(era_series=False) 109 | # Ensure warning is raised 110 | with pytest.warns(UserWarning): 111 | result2 = fn.transform(X, features=features) 112 | assert np.all(result == result2) 113 | assert len(result2) == len(setup_data) 114 | assert result2.shape[1] == 1 115 | assert np.all(np.isclose(result2, 0, atol=1e-8) | (result >= 0)) 116 | assert np.all(np.isclose(result2, 1, atol=1e-8) | (result <= 1)) 117 | 118 | fn.set_transform_request(era_series=None) 119 | era_series = setup_data["era"] 120 | # Passing era_series should give an error with metadata routing set to None 121 | with pytest.raises(ValueError): 122 | make_pipeline(fn).fit_transform(X, features=features, era_series=era_series) 123 | 124 | 125 | def test_feature_neutralizer_predict_multi_pred(setup_data): 126 | fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5]) 127 | features = setup_data[["feature1", "feature2"]] 128 | era_series = setup_data["era"] 129 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 130 | X = setup_data[["prediction", "prediction2"]] 131 | result = fn.transform(X, features=features, era_series=era_series) 132 | assert len(result) == len(setup_data) 133 | assert result.shape[1] == 2 134 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 135 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 136 | 137 | 138 | def test_feature_neutralizer_predict_multi_prop(setup_data): 139 | fn = FeatureNeutralizer(pred_name="prediction", proportion=[0.5, 0.7]) 140 | features = setup_data[["feature1", "feature2"]] 141 | era_series = setup_data["era"] 142 | X = setup_data["prediction"] 143 | result = fn.transform(X, features=features, era_series=era_series) 144 | assert len(result) == len(setup_data) 145 | assert result.shape[1] == 2 146 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 147 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 148 | 149 | 150 | def test_feature_neutralizer_multi_pred_multi_prop(setup_data): 151 | fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5, 0.7, 0.9]) 152 | features = setup_data[["feature1", "feature2"]] 153 | era_series = setup_data["era"] 154 | setup_data["prediction2"] = np.random.uniform(size=len(setup_data)) 155 | X = setup_data[["prediction", "prediction2"]] 156 | result = fn.transform(X, 
features=features, era_series=era_series) 157 | assert len(result) == len(setup_data) 158 | assert result.shape[1] == 6 159 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 160 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 161 | 162 | # Test with numpy X 163 | result = fn.transform(X.to_numpy(), features=features, era_series=era_series) 164 | assert len(result) == len(setup_data) 165 | assert result.shape[1] == 6 166 | assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0)) 167 | assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1)) 168 | 169 | 170 | def test_feature_neutralizer_neutralize(setup_data): 171 | columns = ["prediction"] 172 | by = ["feature1", "feature2"] 173 | scores = FeatureNeutralizer().neutralize(setup_data, columns, by, proportion=0.5) 174 | assert isinstance(scores, pd.DataFrame) 175 | 176 | 177 | def test_feature_neutralizer_get_feature_names_out(): 178 | names = FeatureNeutralizer().get_feature_names_out() 179 | assert names == ["prediction_neutralized_0.5"] 180 | 181 | 182 | def test_feature_neutralizer_get_feature_names_out_complex(): 183 | names = FeatureNeutralizer(pred_name="fancy", suffix="blob").get_feature_names_out() 184 | assert names == ["fancy_neutralized_0.5_blob"] 185 | 186 | 187 | def test_feature_neutralizer_get_feature_names_out_with_input_features(): 188 | names = FeatureNeutralizer().get_feature_names_out(input_features=["prediction_fancy1"]) 189 | assert names == ["prediction_fancy1"] 190 | -------------------------------------------------------------------------------- /tests/test_numerframe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from numerai_era_data.date_utils import ERA_ONE_START 5 | 6 | from numerblox.feature_groups import FNCV3_FEATURES, MEDIUM_FEATURES, SMALL_FEATURES, V5_FEATURE_GROUP_MAPPING 7 | from numerblox.numerframe import NumerFrame, create_numerframe 8 | 9 | TEST_FILE_PATH = "tests/test_assets/val_3_eras.parquet" 10 | dataset = pd.read_parquet(TEST_FILE_PATH) 11 | 12 | 13 | def test_numerframe_initialization(): 14 | nf = NumerFrame(dataset) 15 | assert isinstance(nf, NumerFrame) 16 | assert nf.meta == {"era_col": "era"} 17 | assert nf.meta.era_col == "era" 18 | 19 | 20 | def test_get_feature_data(): 21 | nf = NumerFrame(dataset) 22 | features = nf.get_feature_data 23 | assert isinstance(features, NumerFrame) 24 | assert all([col.startswith("feature_") for col in features.columns.tolist()]) 25 | 26 | 27 | def test_get_pattern_data(): 28 | nf = NumerFrame(dataset) 29 | xerxes_targets = nf.get_pattern_data("xerxes") 30 | assert isinstance(xerxes_targets, NumerFrame) 31 | assert xerxes_targets.columns.tolist() == ["target_xerxes_20", "target_xerxes_60"] 32 | 33 | 34 | def test_get_target_data(): 35 | nf = NumerFrame(dataset) 36 | targets = nf.get_target_data 37 | assert isinstance(targets, NumerFrame) 38 | assert all([col.startswith("target") for col in targets.columns.tolist()]) 39 | 40 | 41 | def test_get_single_target_data(): 42 | nf = NumerFrame(dataset) 43 | single_target = nf.get_single_target_data 44 | assert isinstance(single_target, NumerFrame) 45 | assert single_target.columns.tolist() == ["target"] 46 | 47 | 48 | def test_get_prediction_data(): 49 | nf = NumerFrame(dataset) 50 | preds = nf.get_prediction_data 51 | assert isinstance(preds, NumerFrame) 52 | assert preds.columns.tolist() == [] 53 | 54 | 55 | def test_get_column_selection(): 56 | nf = 
NumerFrame(dataset) 57 | result = nf.get_column_selection(["feature_itinerant_hexahedral_photoengraver"]) 58 | assert isinstance(result, NumerFrame) 59 | assert result.columns.tolist() == ["feature_itinerant_hexahedral_photoengraver"] 60 | 61 | 62 | def test_get_aux_data(): 63 | nf = NumerFrame(dataset) 64 | aux_data = nf.get_aux_data 65 | assert isinstance(aux_data, NumerFrame) 66 | assert aux_data.columns.tolist() == ["era", "data_type"] 67 | 68 | 69 | def test_get_era_data(): 70 | nf = NumerFrame(dataset) 71 | era_data = nf.get_era_data 72 | assert isinstance(era_data, NumerFrame) 73 | assert era_data.columns.tolist() == ["era"] 74 | 75 | 76 | def test_get_prediction_aux_data(): 77 | nf = NumerFrame(dataset) 78 | nf["prediction"] = 1 79 | nf = NumerFrame(nf) 80 | pred_aux = nf.get_prediction_aux_data 81 | assert isinstance(pred_aux, NumerFrame) 82 | assert pred_aux.columns.tolist() == ["prediction", "era", "data_type"] 83 | 84 | 85 | def test_get_feature_target_pair(): 86 | nf = NumerFrame(dataset) 87 | X, y = nf.get_feature_target_pair() 88 | assert isinstance(X, NumerFrame) 89 | assert X.columns.tolist() == nf.get_feature_data.columns.tolist() 90 | assert y.columns.tolist() == ["target"] 91 | 92 | 93 | def test_get_feature_target_pair_multi_target(): 94 | nf = NumerFrame(dataset) 95 | X, y = nf.get_feature_target_pair(multi_target=True) 96 | assert isinstance(X, NumerFrame) 97 | assert X.columns.tolist() == nf.get_feature_data.columns.tolist() 98 | assert y.columns.tolist() == nf.get_target_data.columns.tolist() 99 | 100 | 101 | def test_get_fncv3_features(): 102 | nf = NumerFrame(dataset) 103 | result = nf.get_fncv3_feature_data 104 | assert isinstance(result, NumerFrame) 105 | assert result.columns.tolist() == FNCV3_FEATURES 106 | 107 | 108 | def test_get_small_features(): 109 | nf = NumerFrame(dataset) 110 | result = nf.get_small_feature_data 111 | assert isinstance(result, NumerFrame) 112 | assert result.columns.tolist() == SMALL_FEATURES 113 | 114 | 115 | def test_get_medium_features(): 116 | nf = NumerFrame(dataset) 117 | result = nf.get_medium_feature_data 118 | assert isinstance(result, NumerFrame) 119 | assert result.columns.tolist() == MEDIUM_FEATURES 120 | 121 | 122 | def test_get_unique_eras(): 123 | nf = NumerFrame(dataset) 124 | result = nf.get_unique_eras 125 | assert isinstance(result, list) 126 | assert result == ["0575", "0576", "0577"] 127 | 128 | 129 | def test_get_feature_group(): 130 | # Test with a valid group name 131 | nf = NumerFrame(dataset) 132 | result = nf.get_feature_group("rain") 133 | assert isinstance(result, NumerFrame) 134 | assert result.columns.tolist() == V5_FEATURE_GROUP_MAPPING["rain"] 135 | 136 | # Test with an invalid group name 137 | with pytest.raises(AssertionError, match=r".*not found in.*"): 138 | nf.get_feature_group("group_invalid") 139 | 140 | 141 | def test_get_last_n_eras(): 142 | nf = NumerFrame(dataset) 143 | result = nf.get_last_n_eras(2) 144 | assert isinstance(result, NumerFrame) 145 | assert result[nf.meta.era_col].unique().tolist() == ["0576", "0577"] 146 | assert result.shape == (11313, 2415) 147 | 148 | 149 | def test_get_era_batch(): 150 | nf = NumerFrame(dataset) 151 | eras = ["0575", "0576"] 152 | X, y = nf.get_era_batch(eras=eras) 153 | assert isinstance(X, np.ndarray) 154 | assert X.shape == (11230, 2376) 155 | assert y.shape == (11230, 37) 156 | 157 | 158 | def test_get_era_from_date(): 159 | nf = NumerFrame(dataset) 160 | era = nf.get_era_from_date(pd.Timestamp("2016-01-01")) 161 | assert isinstance(era, int) 162 | 
assert era == 677 163 | 164 | era1 = nf.get_era_from_date(pd.Timestamp(ERA_ONE_START)) 165 | assert isinstance(era1, int) 166 | assert era1 == 1 167 | 168 | 169 | def test_get_date_from_era(): 170 | nf = NumerFrame(dataset) 171 | date = nf.get_date_from_era(era=4) 172 | assert isinstance(date, pd.Timestamp) 173 | assert date == pd.Timestamp("2003-02-01") 174 | 175 | date1 = nf.get_date_from_era(era=1) 176 | assert isinstance(date1, pd.Timestamp) 177 | assert date1 == pd.Timestamp(ERA_ONE_START) 178 | 179 | 180 | def test_get_dates_from_era_col(): 181 | nf = NumerFrame(dataset).iloc[:5] 182 | result = nf.get_dates_from_era_col 183 | assert isinstance(result, pd.Series) 184 | assert all(result.index == nf.index[:5]) 185 | assert result.tolist() == [pd.Timestamp("2014-01-11 00:00:00")] * len(result) 186 | 187 | 188 | def test_get_eras_from_date_col(): 189 | dataset_copy = dataset.copy() 190 | # Use a smaller range of dates 191 | dataset_copy["date"] = [pd.Timestamp(ERA_ONE_START) + pd.Timedelta(days=i) for i in range(len(dataset_copy))] 192 | dataset_copy = dataset_copy.drop(columns="era") 193 | nf = NumerFrame(dataset_copy.iloc[:5]) 194 | result = nf.get_eras_from_date_col 195 | assert isinstance(result, pd.Series) 196 | assert all(result.index == nf.index[:5]) 197 | assert result.tolist() == [1, 1, 1, 1, 1] 198 | 199 | 200 | def test_get_era_range(): 201 | nf = NumerFrame(dataset) 202 | result = nf.get_era_range(start_era=575, end_era=576) 203 | assert isinstance(result, NumerFrame) 204 | assert result[nf.meta.era_col].unique().tolist() == ["0575", "0576"] 205 | assert result.shape == (11230, 2415) 206 | 207 | with pytest.raises(AssertionError): 208 | no_era_dataset = dataset.drop("era", axis="columns") 209 | no_era_dataset["date"] = pd.Timestamp("2016-01-01") 210 | nf = NumerFrame(no_era_dataset) 211 | nf.get_era_range(start_era=1, end_era=3) 212 | # Negative era 213 | with pytest.raises(AssertionError): 214 | nf.get_era_range(-1, 5) 215 | # End era before start era 216 | with pytest.raises(AssertionError): 217 | nf.get_era_range(20, 3) 218 | # Start era not int 219 | with pytest.raises(AssertionError): 220 | nf.get_era_range("0001", 2) 221 | # End era not int 222 | with pytest.raises(AssertionError): 223 | nf.get_era_range(1, "0002") 224 | 225 | 226 | def test_get_date_range(): 227 | date_col_dataset = dataset.drop("era", axis="columns") 228 | date_col_dataset["date"] = [pd.Timestamp("2016-01-01") + pd.Timedelta(days=i) for i in range(0, len(date_col_dataset))] 229 | nf = NumerFrame(date_col_dataset) 230 | result = nf.get_date_range(start_date=pd.Timestamp("2016-01-01"), end_date=pd.Timestamp("2016-01-03")) 231 | assert isinstance(result, NumerFrame) 232 | assert result[nf.meta.era_col].unique().tolist() == [pd.Timestamp("2016-01-01"), pd.Timestamp("2016-01-02"), pd.Timestamp("2016-01-03")] 233 | assert result.shape == (3, 2415) 234 | 235 | # End date before start date 236 | with pytest.raises(AssertionError): 237 | nf.get_date_range(pd.Timestamp("2022-01-05"), pd.Timestamp("2022-01-01")) 238 | # Date before era 1 239 | with pytest.raises(AssertionError): 240 | nf.get_date_range(pd.Timestamp("1970-01-05"), pd.Timestamp("1971-01-10")) 241 | # Start date not pd.Timestamp 242 | with pytest.raises(AssertionError): 243 | nf.get_date_range("2016-01-01", pd.Timestamp("2016-01-10")) 244 | # End date not pd.Timestamp 245 | with pytest.raises(AssertionError): 246 | nf.get_date_range(pd.Timestamp("2016-01-01"), "2016-01-10") 247 | 248 | 249 | def test_create_numerframe(): 250 | nf = 
create_numerframe(TEST_FILE_PATH) 251 | assert isinstance(nf, NumerFrame) 252 | -------------------------------------------------------------------------------- /tests/test_penalizers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from utils import create_classic_sample_data 3 | 4 | from numerblox.penalizers import BasePenalizer, FeaturePenalizer 5 | 6 | setup_data = create_classic_sample_data 7 | 8 | 9 | def test_base_penalizer_initialization(): 10 | bn = BasePenalizer(new_col_name="test") 11 | assert bn.new_col_name == "test" 12 | 13 | 14 | def test_base_penalizer_fit(setup_data): 15 | obj = BasePenalizer(new_col_name="test").fit(setup_data) 16 | assert isinstance(obj, BasePenalizer) 17 | 18 | 19 | @pytest.mark.xfail(reason="TensorFlow is not installed") 20 | def test_feature_penalizer_initialization(): 21 | fp = FeaturePenalizer(max_exposure=0.5) 22 | assert fp.new_col_name.startswith("prediction_penalized_") 23 | assert fp.max_exposure == 0.5 24 | 25 | 26 | @pytest.mark.xfail(reason="TensorFlow is not installed") 27 | def test_feature_penalizer_get_feature_names_out(): 28 | names = FeaturePenalizer(max_exposure=0.5).get_feature_names_out() 29 | assert names == ["prediction_penalized_0.5"] 30 | 31 | 32 | @pytest.mark.xfail(reason="TensorFlow is not installed") 33 | def test_feature_penalizer_get_feature_names_out_complex(): 34 | names = FeaturePenalizer(max_exposure=0.7, pred_name="fancy", suffix="blob").get_feature_names_out() 35 | assert names == ["fancy_penalized_0.7_blob"] 36 | 37 | 38 | @pytest.mark.xfail(reason="TensorFlow is not installed") 39 | def test_feature_penalizer_get_feature_names_out_with_input_features(): 40 | names = FeaturePenalizer(max_exposure=0.5).get_feature_names_out(input_features=["prediction_fancy1"]) 41 | assert names == ["prediction_fancy1"] 42 | -------------------------------------------------------------------------------- /tests/test_prediction_loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.datasets import make_regression 7 | from sklearn.pipeline import FeatureUnion, Pipeline 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | from numerblox.prediction_loaders import BasePredictionLoader, ExamplePredictions 11 | 12 | 13 | def test_example_predictions_basic(): 14 | ep = ExamplePredictions() 15 | preds = ep.fit_transform(None) 16 | # Check all values are between 0 and 1 17 | assert preds["prediction"].min() >= 0 18 | assert preds["prediction"].max() <= 1 19 | assert isinstance(preds, pd.DataFrame) 20 | assert issubclass(ExamplePredictions, (BasePredictionLoader, TransformerMixin, BaseEstimator)) 21 | assert BasePredictionLoader.__bases__[-1] == BaseEstimator, "BaseEstimator must be the rightmost base class" 22 | 23 | 24 | def test_example_predictions_pipeline(): 25 | # Create dummy dataset 26 | X, y = make_regression(n_samples=100, n_features=20, noise=0.1) 27 | X = pd.DataFrame(X) 28 | 29 | # Create pipeline with standard scaler and example predictions 30 | pipeline = Pipeline([("scaler", StandardScaler()), ("predictions", ExamplePredictions())]) 31 | # Get results 32 | preds = pipeline.fit_transform(X, y) 33 | 34 | # Check all values are between 0 and 1 35 | assert preds["prediction"].min() >= 0 36 | assert preds["prediction"].max() <= 1 37 | assert isinstance(preds, pd.DataFrame) 38 | 39 | 40 | 
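# Illustrative usage sketch (not a test and never invoked): based only on the calls
# exercised in this module, ExamplePredictions can also be used standalone —
# fit_transform(None) downloads Numerai's example predictions and returns a DataFrame
# with a "prediction" column in [0, 1]; keep_files controls whether the downloaded
# file is kept on disk afterwards.
def _example_predictions_usage_sketch():
    ep = ExamplePredictions(keep_files=False)
    live_example_preds = ep.fit_transform(None)
    # Summary statistics of the rank-style predictions (all within [0, 1]).
    return live_example_preds["prediction"].agg(["min", "max", "mean"])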
def test_example_predictions_feature_union(): 41 | # Get predictions in basic setting to compare output 42 | ep = ExamplePredictions() 43 | preds = ep.fit_transform(None) 44 | 45 | # Dummy data 46 | X, _ = make_regression(n_samples=len(preds), n_features=2, noise=0.1) 47 | 48 | # Create feature union 49 | combined_features = FeatureUnion([("standard", StandardScaler()), ("example", ExamplePredictions())]) 50 | 51 | # Transform data 52 | X_transformed = combined_features.fit_transform(X) 53 | 54 | # Ensure the transformation worked 55 | assert np.allclose(X_transformed[:, -1], preds["prediction"].values) 56 | assert X_transformed.shape[0] == X.shape[0] 57 | assert X_transformed.shape[1] == 3 58 | 59 | 60 | def test_example_predictions_get_feature_names_out(): 61 | ep = ExamplePredictions() 62 | assert ep.get_feature_names_out() == ["v5.0/live_example_preds"] 63 | assert ep.get_feature_names_out(["a", "b"]) == ["a", "b"] 64 | 65 | 66 | def test_example_predictions_keep_files(): 67 | # Test with keep_files = True 68 | ep_keep = ExamplePredictions(keep_files=True) 69 | ep_keep.fit_transform(None) 70 | assert os.path.isdir(ep_keep.downloader.dir), "Directory should be kept with keep_files=True" 71 | assert os.path.exists(ep_keep.dest_path), "File should be kept with keep_files=True" 72 | # Clean up 73 | ep_keep.downloader.remove_base_directory() 74 | 75 | # Test with keep_files = False 76 | ep_remove = ExamplePredictions(keep_files=False) 77 | ep_remove.fit_transform(None) 78 | assert not os.path.isdir(ep_remove.downloader.dir), "Directory should be removed with keep_files=False" 79 | -------------------------------------------------------------------------------- /tests/test_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from datetime import datetime 4 | from random import choices 5 | from string import ascii_uppercase 6 | from unittest.mock import patch 7 | from uuid import uuid4 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | from dateutil.relativedelta import FR, relativedelta 13 | 14 | from numerblox.misc import Key 15 | from numerblox.submission import NumeraiClassicSubmitter, NumeraiCryptoSubmitter, NumeraiSignalsSubmitter 16 | 17 | TARGET_NAME = "prediction" 18 | 19 | 20 | def _create_random_classic_df(): 21 | # Create random predictions dataframe 22 | n_rows = 100 23 | test_dataf = pd.DataFrame(np.random.uniform(size=n_rows), columns=[TARGET_NAME]) 24 | test_dataf["id"] = [uuid4() for _ in range(n_rows)] 25 | test_dataf = test_dataf.set_index("id") 26 | return test_dataf 27 | 28 | 29 | def create_random_signals_df(n_rows=1000): 30 | signals_test_dataf = pd.DataFrame(np.random.uniform(size=(n_rows, 1)), columns=["signal"]) 31 | signals_test_dataf["ticker"] = ["".join(choices(ascii_uppercase, k=4)) for _ in range(n_rows)] 32 | last_friday = str((datetime.now() + relativedelta(weekday=FR(-1))).date()).replace("-", "") 33 | signals_test_dataf["last_friday"] = last_friday 34 | signals_test_dataf["data_type"] = "live" 35 | return signals_test_dataf 36 | 37 | 38 | def test_classic_submitter(): 39 | # Initialization 40 | test_dir = f"test_sub_{uuid4()}" 41 | classic_key = Key(pub_id="Hello", secret_key="World") 42 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key) 43 | assert num_sub.dir.is_dir() 44 | 45 | # Save CSV 46 | test_dataf = _create_random_classic_df() 47 | file_name = "test.csv" 48 | num_sub.save_csv(dataf=test_dataf, 
file_name=file_name, cols=TARGET_NAME) 49 | num_sub.save_csv(dataf=test_dataf, file_name="test2.csv", cols=TARGET_NAME) 50 | assert (num_sub.dir / file_name).is_file() 51 | 52 | # Combine CSVs 53 | combined = num_sub.combine_csvs([f"{test_dir}/test.csv", f"{test_dir}/test2.csv"], aux_cols=["id"]) 54 | assert combined.columns == [TARGET_NAME] 55 | 56 | # Test that saving breaks if range is invalid. 57 | with pytest.raises(ValueError): 58 | invalid_signal = deepcopy(test_dataf) 59 | invalid_signal[TARGET_NAME] = invalid_signal[TARGET_NAME].add(10) 60 | num_sub.save_csv( 61 | invalid_signal, 62 | file_name="should_not_save.csv", 63 | cols=TARGET_NAME, 64 | ) 65 | 66 | # Wind down 67 | num_sub.remove_base_directory() 68 | assert not os.path.exists(test_dir) 69 | 70 | 71 | def test_signals_submitter(): 72 | # Initialization 73 | test_dir = f"test_sub_{uuid4()}" 74 | signals_key = Key(pub_id="Hello", secret_key="World") 75 | signals_sub = NumeraiSignalsSubmitter(directory_path=test_dir, key=signals_key) 76 | assert signals_sub.dir.is_dir() 77 | 78 | # Save CSVs 79 | test_dataf = create_random_signals_df() 80 | signals_cols = ["signal", "ticker", "data_type", "last_friday"] 81 | file_name = "signals_test.csv" 82 | signals_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=signals_cols) 83 | signals_sub.save_csv(dataf=test_dataf, file_name="signals_test2.csv", cols=signals_cols) 84 | 85 | combined_signals = signals_sub.combine_csvs(csv_paths=[f"{test_dir}/signals_test.csv", f"{test_dir}/signals_test2.csv"], aux_cols=["ticker", "last_friday", "data_type"], era_col="last_friday", pred_col="signal") 86 | assert combined_signals.columns == ["signal"] 87 | 88 | # Test that saving breaks if range is invalid. 89 | with pytest.raises(ValueError): 90 | invalid_signal = deepcopy(test_dataf) 91 | invalid_signal.loc[0, "signal"] += 10 92 | signals_sub.save_csv( 93 | invalid_signal, 94 | file_name="should_not_save.csv", 95 | cols=list(invalid_signal.columns), 96 | ) 97 | 98 | # Test that saving breaks if ticker is invalid. 
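# (NumeraiSignalsSubmitter presumably validates that a recognized ticker column is present; renaming "ticker" to an arbitrary name should therefore make save_csv raise NotImplementedError, as asserted below.)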
99 | with pytest.raises(NotImplementedError): 100 | invalid_ticker = deepcopy(test_dataf) 101 | invalid_ticker = invalid_ticker.rename({"ticker": "not_a_valid_ticker_format"}, axis=1) 102 | signals_sub.save_csv( 103 | invalid_ticker, 104 | file_name="should_not_save.csv", 105 | cols=list(invalid_ticker.columns), 106 | ) 107 | # Wind down 108 | signals_sub.remove_base_directory() 109 | assert not os.path.exists(test_dir) 110 | 111 | 112 | def test_crypto_submitter(): 113 | # Initialization 114 | test_dir = f"test_sub_{uuid4()}" 115 | crypto_key = Key(pub_id="Hello", secret_key="World") 116 | crypto_sub = NumeraiCryptoSubmitter(directory_path=test_dir, key=crypto_key) 117 | assert crypto_sub.dir.is_dir() 118 | 119 | # Create random crypto predictions dataframe 120 | def create_random_crypto_df(n_rows=1000): 121 | crypto_test_dataf = pd.DataFrame(np.random.uniform(size=(n_rows, 1)), columns=["signal"]) 122 | crypto_test_dataf["symbol"] = [f"CRYPTO_{i:04d}" for i in range(n_rows)] 123 | return crypto_test_dataf 124 | 125 | # Save CSVs 126 | test_dataf = create_random_crypto_df() 127 | crypto_cols = ["symbol", "signal"] 128 | file_name = "crypto_test.csv" 129 | crypto_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=crypto_cols) 130 | crypto_sub.save_csv(dataf=test_dataf, file_name="crypto_test2.csv", cols=crypto_cols) 131 | 132 | combined_crypto = crypto_sub.combine_csvs(csv_paths=[f"{test_dir}/crypto_test.csv", f"{test_dir}/crypto_test2.csv"], aux_cols=["symbol"], pred_col="signal") 133 | assert combined_crypto.columns == ["signal"] 134 | 135 | # Test that saving breaks if range is invalid. 136 | with pytest.raises(ValueError): 137 | invalid_signal = deepcopy(test_dataf) 138 | invalid_signal.loc[0, "signal"] += 10 139 | crypto_sub.save_csv( 140 | invalid_signal, 141 | file_name="should_not_save.csv", 142 | cols=list(invalid_signal.columns), 143 | ) 144 | 145 | # Test that saving breaks if symbol column is missing 146 | with pytest.raises(AssertionError): 147 | invalid_symbol = deepcopy(test_dataf) 148 | invalid_symbol = invalid_symbol.rename(columns={"symbol": "not_symbol"}) 149 | crypto_sub.save_csv( 150 | invalid_symbol, 151 | file_name="should_not_save.csv", 152 | cols=list(invalid_symbol.columns), 153 | ) 154 | 155 | # Wind down 156 | crypto_sub.remove_base_directory() 157 | assert not os.path.exists(test_dir) 158 | 159 | 160 | def raise_api_error(*args, **kwargs): 161 | raise ValueError("Your session is invalid or has expired.") 162 | 163 | 164 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 165 | def test_upload_predictions_retries(mocked_get_model_id): 166 | test_dir = f"test_sub_{uuid4()}" 167 | classic_key = Key(pub_id="Hello", secret_key="World") 168 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 169 | file_name = "test.csv" 170 | 171 | # Save CSV 172 | test_dataf = _create_random_classic_df() 173 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 174 | 175 | with patch.object(num_sub.api, "upload_predictions", side_effect=raise_api_error) as mock_upload: 176 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 177 | # Check if retries happened 'max_retries' times 178 | assert mock_upload.call_count == num_sub.max_retries 179 | num_sub.remove_base_directory() 180 | 181 | 182 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 183 | def 
test_upload_predictions_fail_silently(mocked_get_model_id): 184 | test_dir = f"test_sub_{uuid4()}" 185 | classic_key = Key(pub_id="Hello", secret_key="World") 186 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 187 | file_name = "test.csv" 188 | 189 | # Save CSV 190 | test_dataf = _create_random_classic_df() 191 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 192 | 193 | with patch.object(num_sub.api, "upload_predictions", side_effect=raise_api_error): 194 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 195 | 196 | num_sub.remove_base_directory() 197 | 198 | 199 | @patch.object(NumeraiClassicSubmitter, "_get_model_id", return_value="mocked_model_id") 200 | def test_upload_predictions_exception_handling(mocked_get_model_id): 201 | test_dir = f"test_sub_{uuid4()}" 202 | classic_key = Key(pub_id="Hello", secret_key="World") 203 | num_sub = NumeraiClassicSubmitter(directory_path=test_dir, key=classic_key, sleep_time=0.1, fail_silently=True) 204 | file_name = "test.csv" 205 | 206 | # Save CSV 207 | test_dataf = _create_random_classic_df() 208 | num_sub.save_csv(dataf=test_dataf, file_name=file_name, cols=TARGET_NAME) 209 | 210 | with patch("builtins.print") as mock_print: 211 | num_sub.upload_predictions(file_name=file_name, model_name="mock_model") 212 | assert mock_print.call_count >= num_sub.max_retries 213 | 214 | num_sub.remove_base_directory() 215 | 216 | 217 | # Tests for NumerBaySubmitter 218 | def test_numerbay_submitter(): 219 | pass 220 | -------------------------------------------------------------------------------- /tests/test_targets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import polars as pl 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from tqdm import tqdm 6 | from utils import create_signals_sample_data 7 | 8 | from numerblox.targets import BaseTargetProcessor, BayesianGMMTargetProcessor, SignalsTargetProcessor 9 | 10 | dataset = pd.read_parquet("tests/test_assets/val_3_eras.parquet") 11 | dummy_signals_data = create_signals_sample_data 12 | 13 | ALL_PROCESSORS = [BayesianGMMTargetProcessor, SignalsTargetProcessor] 14 | 15 | 16 | def test_processors_sklearn(): 17 | data = dataset.sample(50) 18 | data = data.drop(columns=["data_type"]) 19 | 20 | assert BaseTargetProcessor.__bases__[-1] == BaseEstimator, "BaseEstimator must be the rightmost base class" 21 | 22 | for processor_cls in tqdm(ALL_PROCESSORS, desc="Testing target processors for scikit-learn compatibility"): 23 | # Initialization 24 | processor = processor_cls() 25 | 26 | # Inherits from Sklearn classes 27 | assert issubclass(processor_cls, (BaseTargetProcessor, TransformerMixin, BaseEstimator)) 28 | 29 | # Test every processor has get_feature_names_out 30 | assert hasattr(processor, "get_feature_names_out"), f"Processor {processor.__class__.__name__} does not have get_feature_names_out. Every implemented target processor should have this method."
31 | 32 | 33 | def test_bayesian_gmm_target_preprocessor(): 34 | bgmm = BayesianGMMTargetProcessor(n_components=2) 35 | 36 | y = dataset["target_xerxes_20"].fillna(0.5) 37 | era_series = dataset["era"] 38 | feature_names = [ 39 | "feature_melismatic_daily_freak", 40 | "feature_pleasurable_facultative_benzol", 41 | ] 42 | X = dataset[feature_names] 43 | 44 | bgmm.fit(X, y, era_series=era_series) 45 | 46 | result = bgmm.transform(X, era_series=era_series) 47 | assert bgmm.get_feature_names_out() == ["fake_target"] 48 | assert len(result) == len(dataset) 49 | assert result.min() >= 0.0 50 | assert result.max() <= 1.0 51 | 52 | # _get_coefs 53 | coefs = bgmm._get_coefs(X, y, era_series=era_series) 54 | assert coefs.shape == (3, 2) 55 | assert coefs.min() >= 0.0 56 | assert coefs.max() <= 1.0 57 | 58 | # Test set_output API 59 | bgmm.set_output(transform="pandas") 60 | result = bgmm.transform(X, era_series=era_series) 61 | assert isinstance(result, pd.DataFrame) 62 | bgmm.set_output(transform="default") 63 | result = bgmm.transform(X, era_series=era_series) 64 | assert isinstance(result, np.ndarray) 65 | 66 | 67 | def test_signals_target_processor(dummy_signals_data): 68 | stp = SignalsTargetProcessor() 69 | stp.set_output(transform="pandas") 70 | era_series = dummy_signals_data["date"] 71 | stp.fit(dummy_signals_data) 72 | result = stp.transform(dummy_signals_data, era_series=era_series) 73 | expected_target_cols = ["target_10d_raw", "target_10d_rank", "target_10d_group", "target_20d_raw", "target_20d_rank", "target_20d_group"] 74 | for col in expected_target_cols: 75 | assert col in result.columns 76 | assert stp.get_feature_names_out() == expected_target_cols 77 | 78 | # Test set_output API 79 | stp.set_output(transform="default") 80 | result = stp.transform(dummy_signals_data, era_series=era_series) 81 | assert isinstance(result, np.ndarray) 82 | 83 | stp.set_output(transform="polars") 84 | result = stp.transform(dummy_signals_data, era_series=era_series) 85 | assert isinstance(result, pl.DataFrame) 86 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def create_classic_sample_data(): 8 | data = {"feature1": [1, 2, 3, 4, 3, 2, 1], "feature2": [4, 3, 2, 1, 3, 1, 2], "prediction": [0.5, 0.6, 0.7, 0.8, 0.2, 0.3, 0.4], "target": [0, 1, 0, 1, 0.25, 0.75, 0.5], "target_2": [0, 0.25, 0.75, 0.50, 0.25, 0.75, 0.5], "era": ["era1", "era2", "era1", "era2", "era1", "era2", "era1"]} 9 | return pd.DataFrame(data) 10 | 11 | 12 | @pytest.fixture 13 | def create_signals_sample_data(): 14 | instances = [] 15 | tickers = ["ABC.US", "DEF.US", "GHI.US", "JKL.US", "MNO.US"] 16 | for ticker in tickers: 17 | price = np.random.randint(10, 100) 18 | for i in range(100): 19 | price += np.random.uniform(-1, 1) 20 | instances.append( 21 | { 22 | "ticker": ticker, 23 | "date": pd.Timestamp("2020-01-01") + pd.Timedelta(days=i), 24 | "open": price - 0.05, 25 | "high": price + 0.02, 26 | "low": price - 0.01, 27 | "close": price, 28 | "adjusted_close": price * np.random.uniform(0.5, 1.5), 29 | "volume": np.random.randint(1000, 10000), 30 | "target": np.random.uniform(), 31 | "target_2": np.random.uniform(), 32 | "prediction": np.random.uniform(), 33 | "prediction_random": np.random.uniform(), 34 | } 35 | ) 36 | # Add instances with only 10 days of data 37 | unwanted_tickers = ["XYZ.US", "RST.US", 
"UVW.US"] 38 | price = np.random.randint(10, 100) 39 | for ticker in unwanted_tickers: 40 | for i in range(10): 41 | price += np.random.uniform(-1, 1) 42 | instances.append( 43 | { 44 | "ticker": ticker, 45 | "date": pd.Timestamp("2020-01-01") + pd.Timedelta(days=i), 46 | "open": price - 0.05, 47 | "high": price + 0.02, 48 | "low": price - 0.01, 49 | "close": price, 50 | "adjusted_close": price * np.random.uniform(0.5, 1.5), 51 | "volume": np.random.randint(1000, 10000), 52 | "target": np.random.uniform(), 53 | "target_2": np.random.uniform(), 54 | "prediction": np.random.uniform(), 55 | "prediction_random": np.random.uniform(), 56 | } 57 | ) 58 | return pd.DataFrame(instances) 59 | --------------------------------------------------------------------------------