├── vscode.env ├── src └── pythor │ ├── __init__.py │ ├── feature.py │ ├── neural.py │ ├── optimisation.py │ ├── benchmark.py │ ├── util.py │ └── numerai.py ├── setup.cfg ├── MANIFEST.in ├── Numerai_Paper3_Rain-2.pdf ├── requirements.txt ├── .vscode └── settings.json ├── tox.ini ├── .editorconfig ├── pyproject.toml ├── Dockerfile ├── .github └── workflows │ ├── test.yml │ └── release.yml ├── LICENSE ├── setup.py ├── README.md └── .gitignore /vscode.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=/;src/;${PYTHONPATH} 2 | -------------------------------------------------------------------------------- /src/pythor/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.1.2" 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: pythor.__version__ 3 | license_files = LICENSE 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include *.md 3 | include LICENSE 4 | recursive-include tests test*.py 5 | -------------------------------------------------------------------------------- /Numerai_Paper3_Rain-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasWong2022/thor-public/HEAD/Numerai_Paper3_Rain-2.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | joblib 3 | numpy 4 | pandas 5 | scipy 6 | scikit-learn 7 | torch 8 | signatory 9 | xgboost 10 | lightgbm 11 | catboost 12 | optuna 13 | cupy 14 | cuml 15 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.unittestEnabled": false, 3 | "python.testing.nosetestsEnabled": false, 4 | "python.testing.pytestEnabled": true, 5 | "python.envFile": "${workspaceRoot}/vscode.env" 6 | } 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{37,38,39} 3 | minversion = 3.3.0 4 | isolated_build = true 5 | 6 | [testenv] 7 | deps = 8 | check-manifest >= 0.42 9 | pytest 10 | commands = 11 | check-manifest --ignore 'tox.ini,tests/**,.editorconfig,vscode.env,.vscode/**' 12 | python setup.py check -m -s 13 | pytest tests {posargs} 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.py] 12 | charset = utf-8 13 | indent_style = space 14 | indent_size = 4 15 | end_of_line = lf 16 | insert_final_newline = true 17 | trim_trailing_whitespace = true 18 | 19 | [*.{md,mdx}] 20 | trim_trailing_whitespace = false 21 | -------------------------------------------------------------------------------- 
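The tox.ini above defines the py37–py39 test environments, which the CI workflow shown later invokes with `tox -e py`. A minimal sketch of running the same checks locally (assuming tox >= 3.3 and at least one of the listed Python interpreters are installed; these commands are not part of the repository itself):

```bash
# Run a single environment against whichever Python is on PATH,
# mirroring the `tox -e py` step in .github/workflows/test.yml
pip install tox
tox -e py

# Or run the full py37/py38/py39 matrix declared in tox.ini
tox
```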
/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=46.4.0", 3 | "wheel", 4 | "joblib", 5 | "numpy", 6 | "pandas", 7 | "scipy", 8 | "scikit-learn", 9 | "optuna", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | [options.extras_require] 13 | cuda = [ 14 | "cupy", 15 | "cuml", 16 | "torch", 17 | "lightgbm --install-option=--cuda", 18 | "xgboost", 19 | "catboost", 20 | ] -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kaggle-gpu-images/python:latest 2 | #### Install Signatory, needs to update when newer supporting torch1.11.0 3 | RUN pip install signatory==1.2.6.1.9.0 --no-deps 4 | #### Additional NN models 5 | RUN pip install pytorch-lightning 6 | RUN pip install pytorch-tabnet --no-deps 7 | RUN pip install numerapi==2.12.9 8 | #### Build THOR package 9 | WORKDIR / 10 | COPY src/pythor/ src/pythor/ 11 | COPY setup.py setup.py 12 | COPY README.md README.md 13 | RUN pip install . 14 | WORKDIR /workspace 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - nonexisting 7 | pull_request: 8 | branches: 9 | - nonexisting 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python: ["3.7.12","3.8.16","3.9.16"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Setup Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python }} 24 | - name: Install Tox and any other packages 25 | run: pip install tox 26 | - name: Run Tox 27 | # Run tox using the version of Python in `PATH` 28 | run: tox -e py 29 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: __token__ 23 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload --repository pypi dist/* 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tom Chen (tomchen.org) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or 
substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="thorml", 8 | author="Thomas Wong", 9 | author_email="mw4315@ic.ac.uk", 10 | description="AutoML tools for Tabular Datasets", 11 | keywords="autoML", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ThomasWong2022/thor-public", 15 | project_urls={ 16 | "Documentation": "https://github.com/ThomasWong2022/thor-public", 17 | "Bug Reports": "https://github.com/ThomasWong2022/thor-public/issues", 18 | "Source Code": "https://github.com/ThomasWong2022/thor-public", 19 | }, 20 | package_dir={"": "src"}, 21 | packages=setuptools.find_packages(where="src"), 22 | classifiers=[ 23 | # see https://pypi.org/classifiers/ 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Developers", 26 | "Topic :: Software Development :: Build Tools", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | "Programming Language :: Python :: 3.9", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "License :: OSI Approved :: MIT License", 33 | "Operating System :: OS Independent", 34 | ], 35 | python_requires=">=3.7", 36 | # install_requires=['Pillow'], 37 | extras_require={ 38 | "dev": ["check-manifest"], 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THOR: Time-Varying High-dimensional Ordinal Regression 2 | 3 | [![Downloads](https://static.pepy.tech/badge/thorml)](https://pepy.tech/project/thorml) 4 | 5 | THOR is a new autoML tool for temporal tabular datasets and time series. It handles high-dimensional datasets with distribution shifts better than other tools, and it uses the latest research results from incremental learning to improve the robustness of machine learning methods. 6 | 7 | 8 | ### Docker 9 | 10 | As this package uses various machine learning and CUDA libraries for GPU support, we recommend using Docker to manage the dependencies. 11 | 12 | The image is available on [Docker Hub](https://hub.docker.com/repository/docker/thomaswong2023/thor-public/general). 13 | 14 | The following Docker image contains all the dependencies used by this tool. 15 | 16 | ```bash 17 | docker pull thomaswong2023/thor-public:deps 18 | docker run --gpus device=all -it -d --rm --name thor-public-example thomaswong2023/thor-public:deps bash 19 | 20 | ``` 21 | 22 | 23 | ### PyPI 24 | 25 | This project is also on [PyPI](https://pypi.org/project/thorml/). 26 | 27 | Install the package with the following command.
Dependencies are not installed with the package 28 | 29 | ```bash 30 | pip install thorml -r requirements.txt 31 | 32 | ``` 33 | 34 | 35 | 36 | ## Citation 37 | If you are using this package in your scientific work, we would appreciate citations to the following preprint on arxiv. 38 | 39 | [Dynamic Feature Projection and model selection methods for temporal tabular datasets with regime changes](https://arxiv.org/abs/2301.00790) 40 | 41 | Bibtex entry: 42 | ``` 43 | @misc{wong2023dynamic, 44 | title={Dynamic Feature Engineering and model selection methods for temporal tabular datasets with regime changes}, 45 | author={Thomas Wong and Mauricio Barahona}, 46 | year={2023}, 47 | eprint={2301.00790}, 48 | archivePrefix={arXiv}, 49 | primaryClass={q-fin.CP} 50 | } 51 | ``` 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /src/pythor/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of feature enginnering methods for time-series data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | 21 | from joblib import Parallel, delayed 22 | import pandas as pd 23 | import numpy as np 24 | from sklearn.base import TransformerMixin, BaseEstimator 25 | import cupy as cp 26 | import cuml 27 | import torch, signatory 28 | 29 | 30 | import logging 31 | 32 | logger = logging.getLogger("Numerai") 33 | 34 | 35 | """ 36 | Feature Engineering used in Numerai Thesis 37 | """ 38 | 39 | 40 | class NumeraiTransformer(TransformerMixin, BaseEstimator): 41 | def __init__( 42 | self, 43 | seed=0, 44 | usesquare=False, 45 | dropout_pct=0.05, 46 | no_product_features=10, 47 | no_pca_features=0, 48 | ): 49 | self.seed = seed 50 | self.usesquare = usesquare 51 | self.dropout_pct = dropout_pct 52 | self.no_product_features = no_product_features 53 | self.no_pca_features = no_pca_features 54 | ## Data Dictionary to reconsturct transformer during inference 55 | self.data = dict() 56 | 57 | ## Transform Numerai Features with mean zero (-2,-1,0,1,2) 58 | def transform(self, X, is_train=True): 59 | ## Numpy Random Number Generator 60 | rng = np.random.default_rng(self.seed) 61 | 62 | ## Drop Out Matrix 63 | if self.dropout_pct > 0 and is_train: 64 | dropout_matrix = 1 - np.random.binomial(1, self.dropout_pct, X.shape) 65 | X_val = X.values * dropout_matrix 66 | 67 | if self.usesquare: 68 | squareX = pd.DataFrame(np.square(X_val), index=X.index) 69 | squareX.columns = ["{}_square".format(x) for x in X.columns] 70 | else: 71 | squareX = pd.DataFrame() 72 | 73 | ## Pair Transforms 74 | if self.no_product_features > 0: 75 | if is_train: 76 | col1 = np.random.choice(X.columns, self.no_product_features) 77 | col2 = np.random.choice( 78 | X.columns, 79 | self.no_product_features, 80 | ) 81 | self.product_features = pd.DataFrame( 82 | { 83 | "col1": col1, 84 | "col2": col2, 85 | } 86 | ).drop_duplicates() 87 | self.data["product_features"] = self.product_features 88 | else: 89 | self.product_features = 
self.data["product_features"] 90 | 91 | productX = pd.DataFrame( 92 | np.array(X[self.product_features["col1"]]) 93 | * np.array(X[self.product_features["col2"]]), 94 | index=X.index, 95 | ) 96 | productX.columns = [ 97 | f"feature_product_{i}" for i in range(self.product_features.shape[0]) 98 | ] 99 | else: 100 | productX = pd.DataFrame() 101 | 102 | ## Concat All Features to output 103 | transformed_features = pd.concat( 104 | [ 105 | X.astype(np.int8), 106 | squareX.astype(np.int8), 107 | productX.astype(np.int8), 108 | ], 109 | axis=1, 110 | ) 111 | 112 | return transformed_features 113 | 114 | 115 | class SignatureTransformer(TransformerMixin, BaseEstimator): 116 | def __init__( 117 | self, 118 | lookback, 119 | signature_level, 120 | ): 121 | self.lookback = lookback 122 | self.signature_level = signature_level 123 | 124 | ## Can also be used to transform data in an online fashion by transform 125 | def transform(self, X): 126 | history_length = X.shape[0] 127 | path_class = signatory.Path( 128 | torch.Tensor(cp.asarray([X.values])), self.signature_level 129 | ) 130 | sigs = list() 131 | for i in range(self.lookback, history_length): 132 | sigs.append(path_class.logsignature(i - self.lookback, i)) 133 | all_sig = torch.concat(sigs) 134 | transformed_signature = pd.DataFrame( 135 | all_sig.numpy(), index=X.index[self.lookback :] 136 | ) 137 | transformed_signature.columns = [ 138 | "lookback_{}_signature_{}".format(self.lookback, i) 139 | for i in range(transformed_signature.shape[1]) 140 | ] 141 | return transformed_signature 142 | 143 | 144 | def features_transform_batch(transformer, data, is_train=True): 145 | BATCH_SIZE = 10000000000 146 | start_index = 0 147 | transformed_features_batches = list() 148 | 149 | while start_index < data.shape[0]: 150 | data_batch = data.iloc[start_index : start_index + BATCH_SIZE] 151 | transformed_featrues_batch = pd.DataFrame( 152 | transformer.transform(data_batch, is_train=is_train), index=data_batch.index 153 | ) 154 | transformed_features_batches.append(transformed_featrues_batch) 155 | start_index = start_index + BATCH_SIZE 156 | 157 | transformed_features = pd.concat(transformed_features_batches, axis=0) 158 | return transformer, transformed_features 159 | 160 | 161 | def benchmark_features_transform( 162 | X_train, 163 | y_train, 164 | X_test=None, 165 | group_train=None, 166 | group_test=None, 167 | feature_eng=None, 168 | feature_eng_parameters=None, 169 | debug=False, 170 | ): 171 | ### Numerai 172 | if feature_eng in [ 173 | "numerai", 174 | ]: 175 | if feature_eng_parameters is None: 176 | feature_eng_parameters = { 177 | "usesquare": False, 178 | "no_product_features": 0, 179 | "seed": 10, 180 | } 181 | transformer = NumeraiTransformer(**feature_eng_parameters) 182 | 183 | if feature_eng is not None: 184 | extracted_features_train = transformer.transform(X_train, is_train=True) 185 | if X_test is not None: 186 | extracted_features_test = transformer.transform(X_test, is_train=False) 187 | else: 188 | extracted_features_test = None 189 | 190 | return transformer, extracted_features_train, extracted_features_test 191 | else: 192 | if X_test is not None: 193 | return None, X_train, X_test 194 | else: 195 | return None, X_train, None 196 | -------------------------------------------------------------------------------- /src/pythor/neural.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib, os, shutil, datetime 4 | import torch 5 | 6 | import 
logging, gc 7 | 8 | logger = logging.getLogger("Numerai") 9 | 10 | 11 | from torch import nn 12 | import torch.nn.functional as F 13 | from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset 14 | from pytorch_lightning import Trainer, LightningModule, seed_everything 15 | from pytorch_lightning.callbacks.early_stopping import EarlyStopping 16 | import cupy as cp 17 | 18 | 19 | # + 20 | ## Tabular Models 21 | # - 22 | 23 | 24 | class TabularModel: 25 | def __init__(self, nn_model, config): 26 | """ 27 | Args: 28 | nn_model (LightningModule): Neural Networks implmented as a LightningModule 29 | config (dict): A dictionary which contains the parameters for training NN 30 | """ 31 | 32 | self.nn_model = nn_model 33 | self.config = config 34 | seed_everything(config.get("seed", 0), workers=True) 35 | 36 | def train(self, X_train, y_train, X_validate, y_validate): 37 | self.config["input_shape"] = X_train.shape[1] 38 | self.config["output_shape"] = y_train.shape[1] 39 | 40 | self.network = self.nn_model(self.config) 41 | 42 | early_stop_callback = EarlyStopping( 43 | monitor="val_loss", 44 | min_delta=0.00, 45 | patience=self.config.get("patience", 5), 46 | verbose=False, 47 | mode="min", 48 | ) 49 | 50 | ## Assume X is a DataFrame, assume y is a DataFrame or pd Series 51 | dataset_train = TensorDataset( 52 | torch.from_numpy(X_train.values), torch.from_numpy(y_train.values) 53 | ) 54 | dataloader_train = DataLoader( 55 | dataset_train, 56 | batch_size=self.config.get("batch_size", 4096), 57 | num_workers=0, 58 | ) 59 | dataset_validate = TensorDataset( 60 | torch.from_numpy(X_validate.values), torch.from_numpy(y_validate.values) 61 | ) 62 | dataloader_validate = DataLoader( 63 | dataset_validate, 64 | batch_size=self.config.get("batch_size", 4096), 65 | num_workers=0, 66 | ) 67 | 68 | ## Use GPU if possible 69 | self.trainer = Trainer( 70 | accelerator="cuda", 71 | deterministic=True, 72 | auto_lr_find=True, 73 | max_epochs=self.config.get("max_epochs", 3), 74 | callbacks=[early_stop_callback], 75 | ) 76 | 77 | self.trainer.fit(self.network, dataloader_train, dataloader_validate) 78 | 79 | def predict(self, X): 80 | self.network.eval() 81 | with torch.no_grad(): 82 | predictions = self.network(torch.from_numpy(X)) 83 | return predictions.numpy() 84 | 85 | def load_model(self, checkpoint): 86 | self.network = self.nn_model.load_from_checkpoint(checkpoint) 87 | 88 | def save_model(self, checkpoint): 89 | self.trainer.save_checkpoint(checkpoint) 90 | 91 | 92 | # + 93 | ## Tabular Modules 94 | # - 95 | 96 | 97 | class MLP(LightningModule): 98 | def __init__(self, config): 99 | super().__init__() 100 | self.config = config 101 | 102 | neuron_sizes = config.get("neurons", 256) 103 | num_layers = config.get("num_layers", 2) 104 | 105 | self.layers = nn.Sequential( 106 | nn.Linear(config["input_shape"], neuron_sizes), 107 | nn.ReLU(), 108 | nn.Dropout(config.get("dropout", 0.5)), 109 | ) 110 | 111 | for i in range( 112 | 1, 113 | num_layers, 114 | ): 115 | new_neuron_sizes = int(neuron_sizes * config.get("neuron_scale", 0.5)) + 1 116 | self.layers.append( 117 | nn.Linear(neuron_sizes, new_neuron_sizes), 118 | ) 119 | self.layers.append(nn.ReLU()) 120 | self.layers.append(nn.Dropout(config.get("dropout", 0.5))) 121 | neuron_sizes = new_neuron_sizes 122 | 123 | self.layers.append(nn.Linear(neuron_sizes, config["output_shape"])) 124 | 125 | ## Need to have this to ensure correct hyper-parameters are loaded 126 | ## https://github.com/Lightning-AI/lightning/issues/3981 127 | 
self.save_hyperparameters() 128 | 129 | def forward(self, x): 130 | return self.layers(x.float()) 131 | 132 | def training_step(self, batch, batch_idx): 133 | x, y = batch 134 | y_hat = self.layers(x.float()) 135 | loss = F.mse_loss(y_hat, y.float()) 136 | self.log("train_loss", loss) 137 | return loss 138 | 139 | def validation_step(self, batch, batch_idx): 140 | x, y = batch 141 | y_hat = self.layers(x.float()) 142 | loss = F.mse_loss(y_hat, y.float()) 143 | self.log("val_loss", loss) 144 | 145 | def predict_step(self, batch, batch_idx): 146 | return self(batch) 147 | 148 | def configure_optimizers(self): 149 | optimizer = torch.optim.Adam( 150 | self.parameters(), lr=self.config.get("learning_rate", 1e-4) 151 | ) 152 | return optimizer 153 | 154 | 155 | class LSTM_Tabular(LightningModule): 156 | def __init__(self, config): 157 | super().__init__() 158 | self.config = config 159 | self.lstm = nn.LSTM( 160 | input_size=self.config.get("no_channels", 4), 161 | hidden_size=self.config.get("hidden_size", 4), 162 | num_layers=self.config.get("num_layers", 2), 163 | dropout=self.config.get("dropout", 0.1), 164 | batch_first=True, 165 | ) 166 | self.fc = nn.Linear( 167 | self.config.get("hidden_size", 4), self.config.get("output_shape", 1) 168 | ) 169 | self.save_hyperparameters() 170 | 171 | def forward(self, x): 172 | batch_size, flattened = x.shape 173 | x = torch.reshape( 174 | x, 175 | ( 176 | batch_size, 177 | -1, 178 | self.config.get("no_channels", 4), 179 | ), 180 | ) 181 | flip_columns_order = self.config.get("flip_column_order", True) 182 | if flip_columns_order: 183 | x = torch.flip(x, [1]) 184 | ## LSTM Layers 185 | lstm_out, _ = self.lstm( 186 | x.float() 187 | ) # lstm_out = (batch_size, seq_len, hidden_size) 188 | x = self.fc(lstm_out[:, -1]) 189 | return x 190 | 191 | def training_step(self, batch, batch_idx): 192 | x, y = batch 193 | y_hat = self.forward(x) 194 | loss = F.mse_loss(y_hat.float(), y.float()) 195 | self.log("train_loss", loss) 196 | return loss 197 | 198 | def validation_step(self, batch, batch_idx): 199 | x, y = batch 200 | y_hat = self.forward(x) 201 | loss = F.mse_loss(y_hat.float(), y.float()) 202 | self.log("val_loss", loss) 203 | 204 | def predict_step(self, batch, batch_idx): 205 | return self(batch) 206 | 207 | def configure_optimizers(self): 208 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 209 | return optimizer 210 | 211 | 212 | # + 213 | ## Time Series Models 214 | ## Needs Further Development 215 | # - 216 | 217 | 218 | class TimeSeriesDataset(Dataset): 219 | """Face Landmarks dataset.""" 220 | 221 | def __init__(self, timeseries, targets=None, lookback=200): 222 | """ 223 | Args: 224 | timeseries (pd.DataFrame): A DataFrame of a multivaraite time-series 225 | targets (pd.DataFrame): A DataFrame of targets for the time-series 226 | lookback (int): Number of data records to include in lookback 227 | """ 228 | self.X = timeseries 229 | self.y = targets 230 | self.lookback = lookback 231 | 232 | def __len__(self): 233 | return self.X.shape[0] - (self.lookback - 1) 234 | 235 | def __getitem__(self, idx): 236 | if torch.is_tensor(idx): 237 | idx = idx.tolist() 238 | if self.y is not None: 239 | return torch.tensor( 240 | self.X.values[idx : idx + self.lookback, :] 241 | ), torch.tensor(self.y.values[idx + self.lookback - 1, :]) 242 | else: 243 | return torch.tensor(self.X.values[idx : idx + self.lookback, :]) 244 | 245 | 246 | class TimeSeriesModel: 247 | def __init__(self, nn_model, config): 248 | """ 249 | Args: 250 | nn_model 
(LightningModule): Neural Networks implmented as a LightningModule 251 | config (dict): A dictionary which contains the parameters for training NN 252 | """ 253 | 254 | self.nn_model = nn_model 255 | self.config = config 256 | seed_everything(config.get("seed", 0), workers=True) 257 | 258 | def train(self, X_train, y_train, X_validate, y_validate): 259 | self.config["input_shape"] = X_train.shape[1] 260 | self.config["output_shape"] = y_train.shape[1] 261 | 262 | self.network = self.nn_model(self.config) 263 | 264 | early_stop_callback = EarlyStopping( 265 | monitor="val_loss", 266 | min_delta=0.00, 267 | patience=self.config.get("patience", 10), 268 | verbose=False, 269 | mode="min", 270 | ) 271 | 272 | ## Assume X is a DataFrame, assume y is a DataFrame or pd Series 273 | 274 | train_dataset = TimeSeriesDataset( 275 | X_train, y_train, lookback=self.config.get("lookback", 200) 276 | ) 277 | dataloader_train = torch.utils.data.DataLoader( 278 | train_dataset, batch_size=self.config.get("batch_size", 1000), shuffle=False 279 | ) 280 | validate_dataset = TimeSeriesDataset( 281 | X_validate, y_validate, lookback=self.config.get("lookback", 200) 282 | ) 283 | dataloader_validate = torch.utils.data.DataLoader( 284 | validate_dataset, 285 | batch_size=self.config.get("batch_size", 1000), 286 | shuffle=False, 287 | ) 288 | 289 | ## Use GPU if possible 290 | self.trainer = Trainer( 291 | accelerator="cuda", 292 | deterministic=True, 293 | auto_lr_find=True, 294 | max_epochs=self.config.get("max_epochs", 3), 295 | callbacks=[early_stop_callback], 296 | ) 297 | 298 | self.trainer.fit(self.network, dataloader_train, dataloader_validate) 299 | 300 | def predict(self, X): 301 | self.network.eval() 302 | with torch.no_grad(): 303 | predictions = self.network(X) 304 | return predictions.numpy() 305 | 306 | def load_model(self, checkpoint): 307 | self.network = self.nn_model.load_from_checkpoint(checkpoint) 308 | 309 | def save_model(self, checkpoint): 310 | self.trainer.save_checkpoint(checkpoint) 311 | 312 | 313 | # + 314 | ### TimeSeires Modules 315 | # - 316 | 317 | 318 | class LSTM(LightningModule): 319 | def __init__(self, config): 320 | super().__init__() 321 | self.config = config 322 | self.lstm = nn.LSTM( 323 | input_size=self.config.get("input_size", 11), 324 | hidden_size=self.config.get("hidden_size", 4), 325 | num_layers=self.config.get("num_layers", 2), 326 | dropout=self.config.get("dropout", 0.1), 327 | batch_first=True, 328 | ) 329 | self.fc = nn.Linear(self.config.get("hidden_size", 4), 11) 330 | 331 | def forward(self, x): 332 | lstm_out, _ = self.lstm(x) # lstm_out = (batch_size, seq_len, hidden_size) 333 | x = self.fc(lstm_out[:, -1]) 334 | return x 335 | 336 | def training_step(self, batch, batch_idx): 337 | x, y = batch 338 | y_hat = self.forward(x.float()) 339 | loss = F.mse_loss(y_hat, y.float()) 340 | self.log("train_loss", loss) 341 | return loss 342 | 343 | def validation_step(self, batch, batch_idx): 344 | x, y = batch 345 | y_hat = self.forward(x.float()) 346 | loss = F.mse_loss(y_hat, y.float()) 347 | self.log("val_loss", loss) 348 | 349 | def predict_step(self, batch, batch_idx): 350 | return self(batch) 351 | 352 | def configure_optimizers(self): 353 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 354 | return optimizer 355 | -------------------------------------------------------------------------------- /src/pythor/optimisation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: 
utf-8 -*- 3 | # 4 | # Optimising hyper-parameters for ML models with Optuna 5 | # 6 | 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import joblib, json, os, gc 11 | 12 | import optuna 13 | from optuna.samplers import RandomSampler, TPESampler 14 | 15 | 16 | from .util import GroupedTimeSeriesSplit, strategy_metrics 17 | from .benchmark import benchmark_pipeline, save_best_model, load_best_model 18 | from .numerai import load_numerai_data, score_numerai 19 | 20 | 21 | import logging 22 | 23 | logger = logging.getLogger("Numerai") 24 | 25 | 26 | ### Create Hyper-parameter space for optuna 27 | ### Extract parameter space that needs to be optimised from the config dictionary 28 | def create_optuna_space(config_dictionary, trial): 29 | space = dict() 30 | for step in ["feature_eng", "ml_method"]: 31 | for k, v in config_dictionary[step]["parameters"].items(): 32 | if isinstance(v, list): 33 | space[k] = getattr(trial, f"suggest_{v[0]}")(name=k, **v[1]) 34 | else: 35 | space[k] = v 36 | return space 37 | 38 | 39 | ### Create Parameter Sets from optuna trial instances 40 | def create_parameters_sets( 41 | args, 42 | config_dictionary, 43 | seed=0, 44 | ): 45 | ### Feature Engineering 46 | feature_eng_parameters = {} 47 | for k, v in config_dictionary["feature_eng"]["parameters"].items(): 48 | feature_eng_parameters[k] = args.get(k, v) 49 | 50 | ### ML Methods 51 | tabular_hyper = { 52 | "seed": seed, 53 | } 54 | 55 | for k, v in config_dictionary["ml_method"]["parameters"].items(): 56 | tabular_hyper[k] = args.get(k, v) 57 | 58 | ### Additional Hyper-parameters to be passed to training loop, NOT used now 59 | additional_hyper = dict() 60 | 61 | return feature_eng_parameters, tabular_hyper, additional_hyper 62 | 63 | 64 | # Create Objective function using optuna for Numerai Classic and Numerai Signals Tournament 65 | 66 | 67 | def create_optuna_numerai_objective( 68 | config_dictionary, numerai_files, seed=0, debug=False 69 | ): 70 | def objective(trial): 71 | with open(numerai_files["feature_metadata"], "r") as f: 72 | feature_metadata = json.load(f) 73 | if config_dictionary["model_params"]["feature_sets"] == "v4": 74 | features_optimizer = feature_metadata["feature_sets"]["fncv3_features"] 75 | bad_features = [ 76 | "feature_palpebral_univalve_pennoncel", 77 | "feature_unsustaining_chewier_adnoun", 78 | "feature_brainish_nonabsorbent_assurance", 79 | "feature_coastal_edible_whang", 80 | "feature_disprovable_topmost_burrower", 81 | "feature_trisomic_hagiographic_fragrance", 82 | "feature_queenliest_childing_ritual", 83 | "feature_censorial_leachier_rickshaw", 84 | "feature_daylong_ecumenic_lucina", 85 | "feature_steric_coxcombic_relinquishment", 86 | ] 87 | features_optimizer = list(set(features_optimizer) - set(bad_features)) 88 | else: 89 | features_optimizer = list() 90 | 91 | features, targets, groups, weights = load_numerai_data( 92 | numerai_files["dataset"], 93 | feature_metadata=numerai_files["feature_metadata"], 94 | resample=0, 95 | resample_freq=config_dictionary["model_params"]["train_resample_freq"], 96 | target_col=config_dictionary["model_params"]["train_targets"], 97 | data_version=config_dictionary["model_params"]["feature_sets"], 98 | startera=config_dictionary["model_params"]["train_startera"], 99 | endera=config_dictionary["model_params"]["train_endera"], 100 | ) 101 | 102 | param = create_optuna_space(config_dictionary, trial) 103 | 104 | logger.info(param) 105 | 106 | ( 107 | feature_eng_parameters, 108 | tabular_hyper, 109 | additional_hyper, 110 | ) = 
create_parameters_sets( 111 | param, 112 | config_dictionary, 113 | seed=seed, 114 | ) 115 | 116 | model_performance, trained_models, data, parameters = benchmark_pipeline( 117 | features, 118 | targets, 119 | weights, 120 | groups, 121 | feature_eng=config_dictionary["feature_eng"]["method"], 122 | feature_eng_parameters=feature_eng_parameters, 123 | tabular_model=config_dictionary["ml_method"]["method"], 124 | tabular_hyper=tabular_hyper, 125 | model_params=config_dictionary["model_params"]["train"], 126 | additional_hyper=additional_hyper, 127 | debug=debug, 128 | ) 129 | ## Get Predictions for each of the walk forward model 130 | ## Score on Validation data 131 | predictions = list() 132 | for model_name in list(data.keys()): 133 | predictions.append(data[model_name]["prediction"]) 134 | train_prediction_df = pd.DataFrame(pd.concat(predictions, axis=0).mean(axis=1)) 135 | train_prediction_df.columns = ["prediction"] 136 | train_prediction_df["target"] = targets.reindex(train_prediction_df.index) 137 | train_prediction_df["era"] = groups.reindex(train_prediction_df.index) 138 | train_prediction_df, correlations_by_era = score_numerai( 139 | train_prediction_df, 140 | features, 141 | riskiest_features=features_optimizer, 142 | proportion=float( 143 | config_dictionary["model_params"]["selection"]["proportion"] 144 | ), 145 | era_col="era", 146 | target_col_name="target", 147 | ) 148 | performances = strategy_metrics(correlations_by_era["neutralised_correlation"]) 149 | metric = performances[ 150 | config_dictionary["model_params"]["selection"]["criteria"] 151 | ] 152 | logger.info(f"Out of Sample Metric {metric}") 153 | return metric 154 | 155 | return objective 156 | 157 | 158 | def optuna_search( 159 | config_dictionary, 160 | numerai_files, 161 | n_trials=10, 162 | timeout=10000, 163 | seed=0, 164 | debug=False, 165 | ): 166 | optuna.logging.set_verbosity(optuna.logging.WARNING) 167 | 168 | numerai_objective = create_optuna_numerai_objective( 169 | config_dictionary, numerai_files, seed=seed, debug=debug 170 | ) 171 | study = optuna.create_study( 172 | direction="maximize", 173 | ) 174 | study.optimize( 175 | numerai_objective, n_trials=n_trials, timeout=timeout, gc_after_trial=True 176 | ) 177 | 178 | return study.best_trial.params, study.best_trial.value 179 | 180 | 181 | def train_best_model_optuna( 182 | target_col_name, 183 | end_era, 184 | best_parameters, 185 | config_dictionary, 186 | numerai_files, 187 | seed=0, 188 | debug=False, 189 | ): 190 | resample_seed = int( 191 | seed % config_dictionary["model_params"]["validate_resample_freq"] 192 | ) 193 | features, targets, groups, weights = load_numerai_data( 194 | numerai_files["dataset"], 195 | feature_metadata=numerai_files["feature_metadata"], 196 | resample=resample_seed, 197 | resample_freq=config_dictionary["model_params"]["validate_resample_freq"], 198 | target_col=[target_col_name], 199 | data_version=config_dictionary["model_params"]["feature_sets"], 200 | startera=config_dictionary["model_params"]["train_startera"], 201 | endera=end_era, 202 | ) 203 | 204 | output_folder = config_dictionary["model_params"]["output_folder"] 205 | 206 | if not os.path.exists(f"{output_folder}/"): 207 | os.mkdir(f"{output_folder}/") 208 | 209 | feature_eng_parameters, tabular_hyper, additional_hyper = create_parameters_sets( 210 | best_parameters, 211 | config_dictionary, 212 | seed=seed, 213 | ) 214 | 215 | model_performance, trained_models, data, parameters = benchmark_pipeline( 216 | features, 217 | targets, 218 | weights, 219 | 
groups, 220 | feature_eng=config_dictionary["feature_eng"]["method"], 221 | feature_eng_parameters=feature_eng_parameters, 222 | tabular_model=config_dictionary["ml_method"]["method"], 223 | tabular_hyper=tabular_hyper, 224 | model_params=config_dictionary["model_params"]["validate"], 225 | additional_hyper=additional_hyper, 226 | debug=debug, 227 | ) 228 | 229 | ## Save each model 230 | for model_name in list(trained_models.keys()): 231 | ## Save Parameters and Feature Transformer 232 | output_parameters_path = f"{output_folder}/{model_name}_{seed}.parameters" 233 | output_parameters = dict() 234 | output_parameters["parameters"] = parameters[model_name] 235 | output_parameters["transformer"] = trained_models[model_name]["transformer"] 236 | joblib.dump(output_parameters, output_parameters_path) 237 | 238 | ## Save Model 239 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 240 | save_best_model( 241 | trained_models[model_name]["model"], 242 | parameters[model_name]["model"]["tabular_model"], 243 | output_model_path, 244 | ) 245 | 246 | return None 247 | 248 | 249 | def numerai_optimisation_pipeline_optuna( 250 | config_dictionary, 251 | numerai_files, 252 | run_optimisation=True, 253 | optimised_parameters_path="numerai_best_parameters.json", 254 | grid_search_seed=0, 255 | n_trials=40, 256 | timeout=2000, 257 | debug=False, 258 | ): 259 | ## Search for optimal hyper-parameters 260 | if run_optimisation: 261 | best_parameters, best_value = optuna_search( 262 | config_dictionary, 263 | numerai_files, 264 | seed=grid_search_seed, 265 | n_trials=n_trials, 266 | timeout=timeout, 267 | debug=debug, 268 | ) 269 | with open(optimised_parameters_path, "w") as f: 270 | best_parameters["Optuna_Best_Value"] = best_value 271 | json.dump(best_parameters, f) 272 | else: 273 | with open(optimised_parameters_path, "r") as f: 274 | best_parameters = json.load(f) 275 | logger.info(f"Using Best parameters {best_parameters}") 276 | 277 | START_SEED = config_dictionary["model_params"]["model_no_start"] 278 | NO_MODELS_PER_CONFIG = config_dictionary["model_params"]["no_models_per_config"] 279 | 280 | if config_dictionary["model_params"]["mix_cv"]: 281 | for target_col_name in config_dictionary["model_params"]["validate_targets"]: 282 | for end_era in config_dictionary["model_params"]["validate_enderas"]: 283 | for seed in range(START_SEED, START_SEED + NO_MODELS_PER_CONFIG): 284 | ## Check if Model already exists 285 | output_folder = config_dictionary["model_params"]["output_folder"] 286 | tabular_model = config_dictionary["ml_method"]["method"] 287 | feature_eng = config_dictionary["feature_eng"]["method"] 288 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, 1) 289 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 290 | if not os.path.exists(output_model_path): 291 | train_best_model_optuna( 292 | target_col_name, 293 | end_era, 294 | best_parameters, 295 | config_dictionary, 296 | numerai_files, 297 | seed=seed, 298 | debug=debug, 299 | ) 300 | 301 | START_SEED = START_SEED + NO_MODELS_PER_CONFIG 302 | else: 303 | for end_era in config_dictionary["model_params"]["validate_enderas"]: 304 | for target_col_name in config_dictionary["model_params"][ 305 | "validate_targets" 306 | ]: 307 | for seed in range(START_SEED, START_SEED + NO_MODELS_PER_CONFIG): 308 | ## Check if Model already exists 309 | output_folder = config_dictionary["model_params"]["output_folder"] 310 | tabular_model = config_dictionary["ml_method"]["method"] 311 | feature_eng = 
config_dictionary["feature_eng"]["method"] 312 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, 1) 313 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 314 | if not os.path.exists(output_model_path): 315 | train_best_model_optuna( 316 | target_col_name, 317 | end_era, 318 | best_parameters, 319 | config_dictionary, 320 | numerai_files, 321 | seed=seed, 322 | debug=debug, 323 | ) 324 | 325 | START_SEED = START_SEED + NO_MODELS_PER_CONFIG 326 | -------------------------------------------------------------------------------- /src/pythor/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of GBDT models for temporal tabular data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | import pandas as pd 21 | import numpy as np 22 | import joblib, os, shutil, datetime 23 | 24 | 25 | 26 | import logging, gc 27 | 28 | if torch.cuda.is_available(): 29 | import cupy as cp 30 | 31 | 32 | from sklearn.metrics import mean_squared_error 33 | from sklearn.model_selection import KFold, GroupKFold, GroupShuffleSplit 34 | 35 | ## Machine Learning packages 36 | from xgboost import XGBRegressor 37 | from lightgbm import LGBMRegressor, LGBMClassifier 38 | from catboost import CatBoostRegressor 39 | 40 | import lightgbm, xgboost, catboost 41 | import torch 42 | 43 | 44 | 45 | from .util import align_features_target, RollingTSTransformer, GroupedTimeSeriesSplit 46 | from .feature import benchmark_features_transform 47 | from .neural import TabularModel, MLP, LSTM_Tabular 48 | 49 | # ## Persistence of ML models 50 | 51 | 52 | ### Save Best Model using method provided 53 | def save_best_model(model, model_type, outputpath): 54 | if model_type in [ 55 | "lightgbm", 56 | "lightgbm-gbdt", 57 | "lightgbm-goss", 58 | "lightgbm-rf", 59 | "lightgbm-dart", 60 | ]: 61 | model.save_model(outputpath) 62 | if model_type in [ 63 | "xgboost", 64 | "xgboost-dart", 65 | "xgboost-gbtree", 66 | ]: 67 | model.save_model(outputpath) 68 | if model_type == "catboost": 69 | model.save_model(outputpath) 70 | if model_type in [ 71 | "Numerai-MLP", 72 | "Numerai-LSTM", 73 | ]: 74 | model.save_model(outputpath) 75 | if model_type == "tabnet": 76 | model.save_model(outputpath) 77 | os.rename("{}.zip".format(outputpath), outputpath) 78 | return None 79 | 80 | 81 | ### load Best Model using method provided 82 | def load_best_model(model_type, outputpath): 83 | if model_type in [ 84 | "lightgbm", 85 | "lightgbm-gbdt", 86 | "lightgbm-goss", 87 | "lightgbm-rf", 88 | "lightgbm-dart", 89 | ]: 90 | reg = lightgbm.Booster(model_file=outputpath) 91 | if model_type in [ 92 | "xgboost", 93 | "xgboost-dart", 94 | "xgboost-gbtree", 95 | ]: 96 | reg = xgboost.Booster() 97 | reg.load_model(outputpath) 98 | if model_type == "catboost": 99 | reg = catboost.CatBoost() 100 | reg.load_model(outputpath) 101 | if model_type in [ 102 | "Numerai-MLP", 103 | ]: 
104 | reg = TabularModel(MLP, config=dict()) 105 | reg.load_model(outputpath) 106 | if model_type in [ 107 | "Numerai-LSTM", 108 | ]: 109 | reg = TabularModel(LSTM_Tabular, config=dict()) 110 | reg.load_model(outputpath) 111 | if model_type == "tabnet": 112 | from pytorch_tabnet.tab_model import TabNetRegressor 113 | 114 | reg = TabNetRegressor() 115 | reg.load_model(outputpath) 116 | if model_type == "feature-momentum": 117 | reg = None 118 | return reg 119 | 120 | 121 | # ## Fit ML Models 122 | 123 | 124 | def benchmark_neural_model( 125 | extracted_features_train, 126 | y_train, 127 | weights_train, 128 | extracted_features_test=None, 129 | y_test=None, 130 | weights_test=None, 131 | tabular_model="Numerai-MLP", 132 | tabular_hyper=None, 133 | additional_hyper=None, 134 | debug=False, 135 | ): 136 | gc.collect() 137 | 138 | ## Initialise and Train Models 139 | if tabular_model in [ 140 | "Numerai-MLP", 141 | ]: 142 | reg = TabularModel(MLP, config=tabular_hyper) 143 | reg.train(extracted_features_train, y_train, extracted_features_test, y_test) 144 | pred = reg.predict(extracted_features_test.values) 145 | return reg, pred 146 | 147 | if tabular_model in [ 148 | "Numerai-LSTM", 149 | ]: 150 | reg = TabularModel(LSTM_Tabular, config=tabular_hyper) 151 | reg.train(extracted_features_train, y_train, extracted_features_test, y_test) 152 | pred = reg.predict(extracted_features_test.values) 153 | return reg, pred 154 | 155 | if tabular_model == "tabnet": 156 | ## Default is PyTorch Adam Optimizer 157 | from torch.optim import Adam 158 | from torch.optim.lr_scheduler import StepLR 159 | from pytorch_tabnet.tab_model import TabNetRegressor 160 | 161 | tabnet_hyper = dict() 162 | tabnet_hyper["optimizer_fn"] = Adam 163 | tabnet_hyper["optimizer_params"] = { 164 | "lr": 0.02, 165 | } 166 | tabnet_hyper["scheduler_fn"] = StepLR 167 | tabnet_hyper["scheduler_params"] = {"gamma": 0.95, "step_size": 20} 168 | 169 | for key in [ 170 | "seed", 171 | "n_d", 172 | "n_a", 173 | "n_steps", 174 | "n_independent", 175 | "n_shared", 176 | "gamma", 177 | "momentum", 178 | "lambda_sparse", 179 | ]: 180 | tabnet_hyper[key] = tabular_hyper[key] 181 | 182 | ## Separate Hyper-parameters in the fit function 183 | tabnet_fit_hyper = dict() 184 | for key in [ 185 | "max_epochs", 186 | "patience", 187 | "batch_size", 188 | ]: 189 | tabnet_fit_hyper[key] = tabular_hyper[key] 190 | 191 | reg = TabNetRegressor(**tabnet_hyper) 192 | reg.fit( 193 | extracted_features_train.values, 194 | y_train.values, 195 | eval_set=[(extracted_features_test.values, y_test.values)], 196 | max_epochs=tabnet_fit_hyper.get("max_epochs", 20), 197 | patience=tabnet_fit_hyper.get("patience", 5), 198 | batch_size=tabnet_fit_hyper.get("batch_size", 40960), 199 | virtual_batch_size=int(tabnet_fit_hyper.get("batch_size", 40960) / 4), 200 | num_workers=0, 201 | ) 202 | pred = reg.predict(extracted_features_test.values) 203 | return reg, pred 204 | 205 | 206 | def benchmark_tree_model( 207 | extracted_features_train, 208 | y_train, 209 | weights_train, 210 | extracted_features_test=None, 211 | y_test=None, 212 | weights_test=None, 213 | tabular_model="lightgbm", 214 | tabular_hyper=None, 215 | additional_hyper=None, 216 | debug=False, 217 | ): 218 | ### Free up Memory from previous loop 219 | gc.collect() 220 | 221 | #### Fit Regressor Model for different ML methods 222 | if tabular_model in [ 223 | "lightgbm", 224 | "lightgbm-gbdt", 225 | "lightgbm-dart", 226 | "lightgbm-goss", 227 | "lightgbm-rf", 228 | ]: 229 | if y_test is not None: 230 | 
train_data = lightgbm.Dataset( 231 | extracted_features_train, 232 | label=y_train, 233 | weight=weights_train, 234 | params={"max_bin": tabular_hyper["max_bin"]}, 235 | ) 236 | test_data = lightgbm.Dataset( 237 | extracted_features_test, 238 | label=y_test, 239 | weight=weights_test, 240 | params={"max_bin": tabular_hyper["max_bin"]}, 241 | ) 242 | early_stopping_rounds = tabular_hyper.get("early_stopping_round", 0) 243 | model = lightgbm.train( 244 | tabular_hyper, 245 | train_set=train_data, 246 | num_boost_round=tabular_hyper["num_iterations"], 247 | valid_sets=[test_data], 248 | callbacks=[ 249 | lightgbm.log_evaluation(period=1000), 250 | lightgbm.early_stopping(early_stopping_rounds), 251 | ], 252 | ) 253 | valid_iteration = min( 254 | additional_hyper.get("gbm_start_iteration", 0), 255 | int(model.num_trees() // 2), 256 | ) 257 | pred = model.predict( 258 | extracted_features_test, start_iteration=valid_iteration 259 | ) 260 | return model, pred 261 | else: 262 | train_data = lightgbm.Dataset( 263 | extracted_features_train, 264 | label=y_train, 265 | weight=weights_train, 266 | ) 267 | model = lightgbm.train( 268 | tabular_hyper, 269 | train_set=train_data, 270 | num_boost_round=tabular_hyper["num_iterations"], 271 | ) 272 | return model 273 | 274 | ## xgboost ignores extra parameters 275 | if tabular_model in [ 276 | "xgboost", 277 | "xgboost-dart", 278 | "xgboost-gbtree", 279 | ]: 280 | ## Create DMatrix 281 | if y_test is not None: 282 | train_data = xgboost.DMatrix( 283 | extracted_features_train, 284 | label=y_train.values.reshape(-1), 285 | weight=weights_train.values.reshape(-1), 286 | ) 287 | test_data = xgboost.DMatrix( 288 | extracted_features_test, 289 | label=y_test.values.reshape(-1), 290 | weight=weights_test.values.reshape(-1), 291 | ) 292 | ### Train XGBoost model 293 | model = xgboost.train( 294 | tabular_hyper, 295 | train_data, 296 | num_boost_round=tabular_hyper["num_boost_round"], 297 | evals=[(test_data, "xgboost_test_data")], 298 | early_stopping_rounds=tabular_hyper["early_stopping_rounds"], 299 | verbose_eval=100, 300 | ) 301 | start_iteration = min( 302 | additional_hyper.get("gbm_start_iteration", 0), 303 | int(model.best_iteration // 2), 304 | ) 305 | end_iteration = model.best_iteration 306 | pred = model.predict( 307 | test_data, 308 | iteration_range=(start_iteration, end_iteration), 309 | ) 310 | return model, pred 311 | else: 312 | train_data = xgboost.DMatrix( 313 | extracted_features_train, 314 | label=y_train.values.reshape(-1), 315 | weight=weights_train.values.reshape(-1), 316 | ) 317 | model = xgboost.train( 318 | tabular_hyper, 319 | train_data, 320 | num_boost_round=tabular_hyper["num_boost_round"], 321 | ) 322 | return model 323 | 324 | 325 | ### Run ML pipeline for temporal tabular data 326 | def benchmark_pipeline( 327 | features, 328 | target, 329 | weights, 330 | groups, 331 | model_params=None, 332 | feature_eng=None, 333 | feature_eng_parameters=None, 334 | tabular_model="lightgbm", 335 | tabular_hyper=None, 336 | additional_hyper=None, 337 | debug=False, 338 | ): 339 | if debug: 340 | print(f"Dataset Sizes {features.shape} {target.shape} {groups.shape}") 341 | 342 | if not model_params: 343 | model_params = { 344 | "valid_splits": 1, 345 | "test_size": 52, 346 | "max_train_size": 52, 347 | "gap": 52, 348 | "cross_validation": "GroupedTimeSeriesSplit", 349 | } 350 | 351 | ## Cross Validation split 352 | if model_params["cross_validation"] == "GroupedTimeSeriesSplit": 353 | tscv = GroupedTimeSeriesSplit( 354 | 
valid_splits=model_params["valid_splits"], 355 | test_size=model_params["test_size"], 356 | max_train_size=model_params["max_train_size"], 357 | gap=model_params["gap"], 358 | debug=debug, 359 | ) 360 | elif model_params["cross_validation"] == "GroupShuffleSplit": 361 | tscv = GroupShuffleSplit( 362 | n_splits=model_params["n_splits"], 363 | test_size=model_params["test_size"], 364 | train_size=model_params["train_size"], 365 | random_state=model_params.get("random_state", 0), 366 | ) 367 | else: 368 | tscv = KFold( 369 | n_splits=model_params["n_splits"], 370 | shuffle=True, 371 | random_state=model_params.get("random_state", 0), 372 | ) 373 | model_no = 1 374 | model_performance = dict() 375 | trained_models = dict() 376 | data = dict() 377 | parameters = dict() 378 | 379 | for train_index, test_index in tscv.split(features, groups=groups): 380 | ## Get Trained and Test Data 381 | if model_params["cross_validation"] == "GroupedTimeSeriesSplit": 382 | X_train, X_test = features.loc[train_index, :], features.loc[test_index, :] 383 | y_train, y_test = target.loc[train_index, :], target.loc[test_index, :] 384 | ## Data Weights are pd Series 385 | weights_train, weights_test = ( 386 | weights.loc[train_index], 387 | weights.loc[test_index], 388 | ) 389 | ## Group Labels are pd Series 390 | group_train, group_test = ( 391 | groups.loc[train_index], 392 | groups.loc[test_index], 393 | ) 394 | if debug: 395 | print(X_train.shape, X_test.shape) 396 | 397 | ## For Existing Cross Validation Splits in scikit-learn it is based on index location (iloc) 398 | else: 399 | X_train, X_test = ( 400 | features.iloc[train_index, :], 401 | features.iloc[test_index, :], 402 | ) 403 | y_train, y_test = target.iloc[train_index, :], target.iloc[test_index, :] 404 | weights_train, weights_test = ( 405 | weights.iloc[train_index], 406 | weights.iloc[test_index], 407 | ) 408 | group_train, group_test = ( 409 | groups.iloc[train_index], 410 | groups.iloc[test_index], 411 | ) 412 | 413 | ### Transform features 414 | 415 | ( 416 | transformer, 417 | extracted_features_train, 418 | extracted_features_test, 419 | ) = benchmark_features_transform( 420 | X_train, 421 | y_train, 422 | X_test, 423 | group_train, 424 | group_test, 425 | feature_eng, 426 | feature_eng_parameters, 427 | debug, 428 | ) 429 | 430 | if tabular_model in [ 431 | "lightgbm-gbdt", 432 | "lightgbm-goss", 433 | "lightgbm-dart", 434 | "lightgbm-rf", 435 | "xgboost-dart", 436 | "xgboost-gbtree", 437 | "catboost", 438 | "lightgbm", 439 | "xgboost", 440 | ]: 441 | ### Train Tabular Models 442 | reg, pred = benchmark_tree_model( 443 | extracted_features_train, 444 | y_train, 445 | weights_train, 446 | extracted_features_test, 447 | y_test, 448 | weights_test, 449 | tabular_model, 450 | tabular_hyper, 451 | additional_hyper, 452 | debug, 453 | ) 454 | 455 | if tabular_model in [ 456 | "Numerai-MLP", 457 | "Numerai-LSTM", 458 | "tabnet", 459 | ]: 460 | ### Train Tabular Models 461 | reg, pred = benchmark_neural_model( 462 | extracted_features_train, 463 | y_train, 464 | weights_train, 465 | extracted_features_test, 466 | y_test, 467 | weights_test, 468 | tabular_model, 469 | tabular_hyper, 470 | additional_hyper, 471 | debug, 472 | ) 473 | 474 | ## Convert Prediction output to a dataframe 475 | pred = pd.DataFrame(pred, index=y_test.index, columns=y_test.columns) 476 | 477 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, model_no) 478 | 479 | parameters[model_name] = { 480 | "feature_eng": feature_eng_parameters.copy(), 481 | "tabular": 
tabular_hyper.copy(), 482 | "additional": additional_hyper.copy(), 483 | } 484 | 485 | ### Compute model performance 486 | model_metrics = dict() 487 | model_metrics["MSE"] = mean_squared_error(y_test, pred) 488 | model_performance[model_name] = model_metrics.copy() 489 | 490 | #### Training Parameters 491 | model_params["feature_columns"] = features.columns 492 | model_params["target_columns"] = target.columns 493 | model_params["feature_engineering"] = feature_eng 494 | model_params["tabular_model"] = tabular_model 495 | model_params["train_start"] = group_train.iloc[0] 496 | model_params["train_end"] = group_train.iloc[-1] 497 | model_params["validation_start"] = group_test.iloc[0] 498 | model_params["validation_end"] = group_test.iloc[-1] 499 | model_params["model_name"] = model_name 500 | parameters[model_name]["model"] = model_params.copy() 501 | 502 | if debug: 503 | print(parameters[model_name]) 504 | 505 | if transformer is not None: 506 | trained_models[model_name] = { 507 | "transformer": transformer.data, 508 | "model": reg, 509 | } 510 | else: 511 | trained_models[model_name] = { 512 | "transformer": None, 513 | "model": reg, 514 | } 515 | 516 | data[model_name] = { 517 | "prediction": pred, 518 | "y_test": y_test, 519 | } 520 | 521 | model_no += 1 522 | 523 | return model_performance, trained_models, data, parameters 524 | -------------------------------------------------------------------------------- /src/pythor/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of tools for data pre-processing for non-stationary time-series and tabular data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | 21 | import pandas as pd 22 | import numpy as np 23 | import joblib, os, glob 24 | 25 | from sklearn.model_selection import TimeSeriesSplit 26 | from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples 27 | 28 | from sklearn.inspection import permutation_importance 29 | from sklearn.preprocessing import StandardScaler 30 | from sklearn.base import TransformerMixin, BaseEstimator 31 | 32 | 33 | """ 34 | Strategy Metrics , Regime Analysis 35 | """ 36 | 37 | 38 | def strategy_metrics(strategy, interval=1, numerai=True, accuracy=4): 39 | results = dict() 40 | results["mean"] = np.around(strategy.mean(), accuracy) 41 | results["volatility"] = np.around(strategy.std(), accuracy) 42 | results["skew"] = np.around(strategy.skew(), accuracy) 43 | results["kurtosis"] = np.around(strategy.kurtosis(), accuracy) 44 | if numerai: 45 | portfolio = strategy.cumsum() 46 | else: 47 | portfolio = (1 + strategy).cumprod() 48 | if numerai: 49 | dd = portfolio - portfolio.cummax() 50 | else: 51 | dd = (portfolio - portfolio.cummax()) / portfolio.cummax() 52 | results["max_drawdown"] = np.around(-1 * dd.cummin().min(), accuracy).item() 53 | if strategy.std() > 0: 54 | results["sharpe"] = np.around(strategy.mean() / strategy.std(), accuracy) 55 | else: 56 | results["sharpe"] = np.around(results["mean"] / 1e-4, accuracy) 57 | if results["max_drawdown"] > 0: 58 | results["calmar"] = np.around( 59 | results["mean"] / results["max_drawdown"], accuracy 60 | ) 61 | else: 62 | results["calmar"] = np.around(results["mean"] / 1e-4, accuracy) 63 | return results 64 | 65 | 66 | def regime_analysis( 67 | df, 68 | performance_col="correlation", 69 | regime_columns="regime", 70 | ): 71 | ans = df.groupby(regime_columns).agg({performance_col: strategy_metrics}) 72 | ans_df = pd.DataFrame(ans[performance_col].values.tolist()) 73 | ans_df.index = ans.index 74 | return ans_df.reset_index() 75 | 76 | 77 | """ 78 | Dynamic Model Selection 79 | """ 80 | 81 | 82 | def dynamic_model_selection_masks(performances, gap=6, lookback=52, top_models=1): 83 | mean = performances.shift(gap).rolling(lookback).mean() 84 | volatility = performances.shift(gap).rolling(lookback).std() 85 | skew = performances.shift(gap).rolling(lookback).skew() 86 | kurt = performances.shift(gap).rolling(lookback).kurt() 87 | drawdown = ( 88 | -1 89 | * ( 90 | performances.shift(gap).cumsum() - performances.shift(gap).cumsum().cummax() 91 | ).cummin() 92 | ) 93 | sharpe = mean / volatility 94 | calmar = mean / drawdown 95 | 96 | metric_masks = dict() 97 | for metric in [ 98 | "mean", 99 | "volatility", 100 | "skew", 101 | "kurt", 102 | "drawdown", 103 | "sharpe", 104 | "calmar", 105 | ]: 106 | metric_masks[f"{metric}_min"] = np.where( 107 | locals()[metric].rank( 108 | axis=1, 109 | ascending=True, 110 | na_option="bottom", 111 | ) 112 | <= top_models, 113 | 1 / top_models, 114 | np.nan, 115 | ) 116 | metric_masks[f"{metric}_max"] = np.where( 117 | locals()[metric].rank( 118 | axis=1, 119 | ascending=False, 120 | na_option="bottom", 121 | ) 122 | <= top_models, 123 | 1 / top_models, 124 | np.nan, 125 | ) 126 | 127 | masks_dataframes = dict() 128 | for metric in [ 129 | "mean", 130 | "volatility", 131 | "skew", 132 | "kurt", 133 | "drawdown", 134 | "sharpe", 135 | "calmar", 136 | ]: 137 | masks_dataframes[f"{metric}_min"] = pd.DataFrame( 138 | metric_masks[f"{metric}_min"], 139 | columns=locals()[metric].columns, 140 | index=locals()[metric].index, 141 | ) 142 | masks_dataframes[f"{metric}_max"] = pd.DataFrame( 143 | 
metric_masks[f"{metric}_max"], 144 | columns=locals()[metric].columns, 145 | index=locals()[metric].index, 146 | ) 147 | return masks_dataframes 148 | 149 | 150 | def walk_forward_dynamic_models(df_list): 151 | Model_Sets = dict() 152 | Imputed_Models = dict() 153 | 154 | for key in [ 155 | "Ensemble", 156 | "Baseline", 157 | "Optimizer", 158 | "Small", 159 | "Medium", 160 | "Standard", 161 | "Average", 162 | ]: 163 | Model_Sets[key] = list() 164 | 165 | for dynamic_models in df_list: 166 | Model_Sets["Ensemble"].append( 167 | dynamic_models[ 168 | [ 169 | x 170 | for x in dynamic_models.columns 171 | if "baseline" in x 172 | or "optimizer" in x 173 | or ("standard" in x and not "average" in x and not "random" in x) 174 | ] 175 | ] 176 | ) 177 | Model_Sets["Average"].append( 178 | dynamic_models[ 179 | [ 180 | x 181 | for x in dynamic_models.columns 182 | if "baseline" in x or "optimizer" in x or "average" in x 183 | ] 184 | ] 185 | ) 186 | Model_Sets["Baseline"].append( 187 | dynamic_models[[x for x in dynamic_models.columns if "baseline" in x]] 188 | ) 189 | Model_Sets["Optimizer"].append( 190 | dynamic_models[[x for x in dynamic_models.columns if "optimizer" in x]] 191 | ) 192 | Model_Sets["Standard"].append( 193 | dynamic_models[ 194 | [ 195 | x 196 | for x in dynamic_models.columns 197 | if "standard" in x and not "average" in x and not "random" in x 198 | ] 199 | ] 200 | ) 201 | Model_Sets["Small"].append( 202 | dynamic_models[ 203 | [ 204 | x 205 | for x in dynamic_models.columns 206 | if "small" in x and not "average" in x and not "random" in x 207 | ] 208 | ] 209 | ) 210 | 211 | for key in [ 212 | "Ensemble", 213 | "Baseline", 214 | "Optimizer", 215 | "Small", 216 | "Standard", 217 | "Average", 218 | ]: 219 | models_over_time = pd.concat(Model_Sets[key], axis=1) 220 | # models_over_time = ( 221 | # models_over_time.transpose() 222 | # .fillna(models_over_time.mean(axis=1)) 223 | # .transpose() 224 | # ) 225 | models_over_time = models_over_time.transpose().fillna(0).transpose() 226 | Imputed_Models[key] = models_over_time.sort_index() 227 | 228 | return Imputed_Models 229 | 230 | 231 | ### Compare Against All Trained Models 232 | 233 | 234 | def create_leaderboard( 235 | performances_folder, 236 | searchkey="*", 237 | lookback=52, 238 | no_tops=1, 239 | model_no_lower=0, 240 | model_no_upper=1e8, 241 | ): 242 | ## Load csv files 243 | performances_files = sorted(glob.glob(f"{performances_folder}/{searchkey}.csv")) 244 | models_list = list() 245 | for f in performances_files: 246 | model_no = int(f.split(".csv")[0].split("_")[-2]) 247 | model_seq = int(f.split(".csv")[0].split("_")[-1]) 248 | model_name = "_".join(f.split(".csv")[0].split("/")[-1].split("_")[:3]) 249 | if ( 250 | os.path.isfile(f) 251 | and model_no_lower <= model_no 252 | and model_no <= model_no_upper 253 | ): 254 | df = pd.read_csv(f, index_col=0).sort_index() 255 | df = df[~df.index.duplicated()] 256 | df.index = pd.to_datetime(df.index) 257 | models_list.append(df) 258 | 259 | dynamic_models_collection = walk_forward_dynamic_models(models_list) 260 | 261 | ### Compute Performances of Portfolios of dynamically selected models 262 | recent_results = list() 263 | dynamic_portfolios = dict() 264 | gap = 6 265 | criteria = [ 266 | "mean", 267 | # "calmar", 268 | # "sharpe", 269 | ] 270 | 271 | for Sets in [ 272 | "Baseline", 273 | "Optimizer", 274 | "Ensemble", 275 | "Small", 276 | "Standard", 277 | ]: 278 | df = dynamic_models_collection[Sets].sort_index() 279 | if df.shape[0] > 0: 280 | dynamic_masks = 
dynamic_model_selection_masks( 281 | df, top_models=no_tops, lookback=lookback, gap=gap 282 | ) 283 | for base_method in criteria: 284 | for method in [ 285 | f"{base_method}_max", 286 | ]: 287 | portfolio = (dynamic_masks[method] * df).sum(axis=1, min_count=1) 288 | dynamic_portfolios[ 289 | f"{Sets}_{method}_{no_tops}_lookback_{lookback}" 290 | ] = portfolio.tail(df.shape[0] - lookback - gap) 291 | performances = strategy_metrics( 292 | portfolio.tail(df.shape[0] - lookback - gap) 293 | ) 294 | performances["method"] = method 295 | performances["no_tops"] = no_tops 296 | performances["sets"] = Sets 297 | performances["lookback"] = lookback 298 | recent_results.append(performances) 299 | 300 | dynamic_performances = pd.DataFrame(recent_results).dropna() 301 | 302 | leaderboards = dict() 303 | ## Recent Leaderboards to be used in Model Submissions 304 | for model_subset in [ 305 | "Baseline", 306 | "Ensemble", 307 | "Optimizer", 308 | "Small", 309 | "Standard", 310 | ]: 311 | leaderboard = pd.DataFrame( 312 | dynamic_models_collection[model_subset] 313 | .sort_index() 314 | .iloc[-1 * lookback :] 315 | .apply(strategy_metrics) 316 | .to_dict() 317 | ).transpose() 318 | 319 | if len(dynamic_models_collection[model_subset].columns) > 0: 320 | leaderboard.index = dynamic_models_collection[model_subset].columns 321 | leaderboard["proportion"] = [ 322 | float(x[-1]) for x in leaderboard.index.str.split("-") 323 | ] 324 | leaderboard["flavour"] = [x[-2] for x in leaderboard.index.str.split("-")] 325 | leaderboard["model_seq"] = [ 326 | int("-".join(x[:-2]).split("_")[-1]) 327 | for x in leaderboard.index.str.split("-") 328 | ] 329 | leaderboard["model_seed"] = [ 330 | int("-".join(x[:-2]).split("_")[-2]) 331 | for x in leaderboard.index.str.split("-") 332 | ] 333 | leaderboard["model_cv"] = [ 334 | "-".join(x[:-2]).split("_")[-3] 335 | for x in leaderboard.index.str.split("-") 336 | ] 337 | leaderboard["model_feature_engineering"] = [ 338 | "-".join(x[:-2]).split("_")[-4] 339 | for x in leaderboard.index.str.split("-") 340 | ] 341 | leaderboard["model_tabular_method"] = [ 342 | "-".join(x[:-2]).split("_")[-5] 343 | for x in leaderboard.index.str.split("-") 344 | ] 345 | leaderboards[model_subset] = leaderboard 346 | 347 | ## Leaderboard Since beginning of data 348 | if dynamic_models_collection[model_subset].shape[0] < lookback + gap: 349 | start_of_data = 0 350 | else: 351 | start_of_data = lookback + gap 352 | leaderboard = pd.DataFrame( 353 | dynamic_models_collection[model_subset] 354 | .sort_index() 355 | .iloc[start_of_data:] 356 | .apply(strategy_metrics) 357 | .to_dict() 358 | ).transpose() 359 | leaderboard.index = dynamic_models_collection[model_subset].columns 360 | leaderboards[f"{model_subset}-All"] = leaderboard 361 | 362 | return ( 363 | dynamic_performances, 364 | dynamic_portfolios, 365 | dynamic_models_collection, 366 | leaderboards, 367 | ) 368 | 369 | 370 | """ 371 | Cross Validation Schemes 372 | 373 | TimeSeries Grouped CV 374 | 375 | """ 376 | 377 | 378 | class GroupedTimeSeriesSplit(TimeSeriesSplit): 379 | def __init__( 380 | self, 381 | n_splits=5, 382 | valid_splits=1, 383 | max_train_size=None, 384 | test_size=52 * 2, 385 | gap=52, 386 | debug=False, 387 | ): 388 | self.n_splits = n_splits 389 | self.valid_splits = valid_splits 390 | self.shuffle = False 391 | self.random_state = None 392 | self.max_train_size = max_train_size 393 | self.test_size = test_size 394 | self.gap = gap 395 | self.debug = debug 396 | 397 | def split(self, X, y=None, groups=None): 398 | 
"""Generate indices to split data into training and test set. 399 | Parameters 400 | ---------- 401 | X : pd.DataFrame of shape (n_samples, n_features) 402 | Training data, where `n_samples` is the number of samples 403 | and `n_features` is the number of features. 404 | y : array-like of shape (n_samples,) 405 | Always ignored, exists for compatibility. 406 | groups : pd.Series of shape (n_samples,) 407 | Group Labels of training data 408 | Yields 409 | ------ 410 | train : ndarray 411 | The training set indices for that split. 412 | test : ndarray 413 | The testing set indices for that split. 414 | """ 415 | X, y, groups = indexable(X, y, groups) 416 | n_samples = _num_samples(X) 417 | 418 | if groups is None: 419 | # n_samples = X.shape[0] 420 | n_splits = self.n_splits 421 | valid_splits = self.valid_splits 422 | n_folds = n_splits + 1 423 | gap = self.gap 424 | test_size = ( 425 | self.test_size if self.test_size is not None else n_samples // n_folds 426 | ) 427 | 428 | # Make sure we have enough samples for the given split parameters 429 | if n_folds > n_samples: 430 | raise ValueError( 431 | f"Cannot have number of folds={n_folds} greater" 432 | f" than the number of samples={n_samples}." 433 | ) 434 | if n_samples - gap - (test_size * n_splits) <= 0: 435 | raise ValueError( 436 | f"Too many splits={n_splits} for number of samples" 437 | f"={n_samples} with test_size={test_size} and gap={gap}." 438 | ) 439 | 440 | indices = X.index 441 | test_starts = range( 442 | n_samples - valid_splits * test_size, n_samples, test_size 443 | ) 444 | 445 | for test_start in test_starts: 446 | train_end = test_start - gap 447 | if self.max_train_size and self.max_train_size < train_end: 448 | yield ( 449 | indices[max(train_end - self.max_train_size, 0) : train_end], 450 | indices[test_start : test_start + test_size], 451 | ) 452 | else: 453 | yield ( 454 | indices[:train_end], 455 | indices[test_start : test_start + test_size], 456 | ) 457 | else: 458 | ## Get unique groups 459 | unique_groups = groups.unique() 460 | gap = self.gap 461 | ## Calculate test size if not provided 462 | if self.test_size: 463 | n_folds = (len(unique_groups) - gap) // self.test_size 464 | else: 465 | n_folds = self.n_splits + 1 466 | self.test_size = len(unique_groups) // n_folds 467 | test_splits = [ 468 | unique_groups[ 469 | len(unique_groups) 470 | - (i + 1) * self.test_size : len(unique_groups) 471 | - i * self.test_size 472 | ] 473 | for i in range(n_folds - 1) 474 | ] 475 | if self.max_train_size: 476 | train_splits = [ 477 | unique_groups[ 478 | max( 479 | len(unique_groups) 480 | - (i + 1) * self.test_size 481 | - gap 482 | - self.max_train_size, 483 | 0, 484 | ) : len(unique_groups) 485 | - (i + 1) * self.test_size 486 | - gap 487 | ] 488 | for i in range(n_folds - 1) 489 | ] 490 | else: 491 | train_splits = [ 492 | unique_groups[: len(unique_groups) - (i + 1) * self.test_size - gap] 493 | for i in range(n_folds - 1) 494 | ] 495 | for i in range(0, self.valid_splits): 496 | yield ( 497 | groups[groups.isin(train_splits[i])].index, 498 | groups[groups.isin(test_splits[i])].index, 499 | ) 500 | 501 | 502 | """ 503 | Data Dimension Transformer 504 | Currently Implemeted: Constant lookback size with zero-padding 505 | Convert from 2D DataFrame, 506 | given a lookback size into nested DataFrames for sktime transformers 507 | """ 508 | 509 | 510 | def forward_fill_zero(series, length): 511 | fill_length = length - series.shape[0] 512 | fill_series = pd.Series(np.zeros(fill_length)) 513 | return 
pd.concat([fill_series, series], axis=0).reset_index(drop=True) 514 | 515 | 516 | ### Create rolling windows of nested dataframe for sktime, forward fill zero if there are not enough data at the start 517 | def roll_2D_to_nested(X, lookback=20, normalise=True): 518 | ## Python index start at zero 519 | lookback = lookback - 1 520 | index = X.index 521 | columns = X.columns 522 | output = np.empty((len(index), len(columns)), dtype=object) 523 | for i in range(X.shape[0]): 524 | for j, c in enumerate(X.columns): 525 | start_index = max(0, i - lookback) 526 | recent_rawdata = pd.Series(X.loc[X.index[start_index : i + 1], c]) 527 | if normalise and i >= 1: 528 | normalised_rawdata = ( 529 | recent_rawdata - recent_rawdata.mean() 530 | ) / recent_rawdata.std() 531 | output[i, j] = forward_fill_zero(normalised_rawdata, lookback + 1) 532 | else: 533 | output[i, j] = forward_fill_zero(recent_rawdata, lookback + 1) 534 | return pd.DataFrame(output, index=index, columns=columns) 535 | 536 | 537 | class RollingTSTransformer(BaseEstimator, TransformerMixin): 538 | def __init__(self, lookback=20, normalise=True): 539 | self.lookback = lookback 540 | self.normalise = normalise 541 | 542 | def fit(self, X, y): 543 | return self 544 | 545 | def transform(self, X): 546 | output = roll_2D_to_nested(X, self.lookback, self.normalise) 547 | return output 548 | 549 | 550 | ### Data Pre-processing 551 | 552 | 553 | def align_features_target(features, target, large_value=1e6): 554 | ## Flatten multi-index column names for tsfresh 555 | if isinstance(features, pd.DataFrame): 556 | if features.columns.nlevels > 1: 557 | features.columns = [ 558 | "_".join(column).rstrip("_") 559 | for column in features.columns.to_flat_index() 560 | ] 561 | ## Remove rows with na and align features and target to same length 562 | ##features.replace(np.inf, large_value, inplace=True) 563 | ##features.replace(-np.inf, -1 * large_value, inplace=True) 564 | ##features = features.dropna() 565 | ##target = target.dropna() 566 | valid_index = features.index.intersection(target.index) 567 | features = features.reindex(valid_index) 568 | target = target.reindex(valid_index) 569 | return features, target 570 | -------------------------------------------------------------------------------- /src/pythor/numerai.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of tools for data processing for Numerai and other temporal tabular data competitions 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | import joblib, datetime, json, os 21 | import pandas as pd 22 | import numpy as np 23 | import scipy 24 | import torch 25 | import xgboost 26 | 27 | from .benchmark import load_best_model 28 | from .util import strategy_metrics 29 | from .feature import NumeraiTransformer 30 | 31 | if torch.cuda.is_available(): 32 | import cupy as cp 33 | import cudf 34 | from cuml.neighbors import KNeighborsRegressor 35 | else: 36 | from sklearn.neighbors import KNeighborsRegressor 37 | 38 | 39 | """ 40 | Helper Functions to convert Numerai Era and Datetime 41 | """ 42 | 43 | 44 | ## Shifting Numerai Era 45 | def shift_era(era, gap=6): 46 | new_era_int = int(era) + gap 47 | new_era = str(new_era_int) 48 | while len(new_era) < 4: 49 | new_era = "0" + new_era 50 | return new_era 51 | 52 | 53 | ## Convert datetime into Numerai eras 54 | def convert_datetime_to_era(sample_date): 55 | baseline = datetime.datetime(year=2003, month=1, day=3) 56 | differences = datetime.datetime.strptime(sample_date, "%Y-%m-%d") - baseline 57 | new_era = str(differences.days // 7 + 1) 58 | while len(new_era) < 4: 59 | new_era = "0" + new_era 60 | return new_era 61 | 62 | 63 | def convert_era_to_datetime(era): 64 | baseline = datetime.datetime(year=2003, month=1, day=3) 65 | new_datetime = baseline + datetime.timedelta(days=7 * (int(era) - 1)) 66 | return new_datetime 67 | 68 | 69 | ### Map columns Numerai Era to datetime 70 | def create_era_index( 71 | df, 72 | baseline=datetime.datetime(year=2003, month=1, day=3), 73 | ): 74 | mapped_era = [ 75 | baseline + datetime.timedelta(days=7 * (int(x) - 1)) for x in df.index 76 | ] 77 | df.index = mapped_era 78 | return df 79 | 80 | 81 | """ 82 | Data Loader for Numerai Data 83 | 84 | """ 85 | 86 | 87 | def load_numerai_data_era( 88 | filename, 89 | feature_metadata="v4_features.json", 90 | resample=0, 91 | resample_freq=1, 92 | target_col=["target"], 93 | era_col="era", 94 | data_version="v4", 95 | startera=None, 96 | endera=None, 97 | ): 98 | ## Read Train Data 99 | df_raw = pd.read_parquet(filename) 100 | ## Select Range 101 | if startera is not None and endera is not None: 102 | df_raw = df_raw[(df_raw[era_col] <= endera) & (df_raw[era_col] >= startera)] 103 | elif endera is not None: 104 | df_raw = df_raw[(df_raw[era_col] <= endera)] 105 | ## Downsample Eras 106 | if resample_freq > 1: 107 | downsampled_eras = df_raw[era_col].unique()[resample::resample_freq] 108 | df = df_raw[df_raw[era_col].isin(downsampled_eras)] 109 | else: 110 | df = df_raw.copy() 111 | 112 | del df_raw 113 | 114 | ## Features Sets 115 | feature_col = [col for col in df.columns if col.startswith("feature_")] 116 | 117 | if data_version in [ 118 | "v4", 119 | "v4-all", 120 | ]: 121 | bad_features = [ 122 | "feature_palpebral_univalve_pennoncel", 123 | "feature_unsustaining_chewier_adnoun", 124 | "feature_brainish_nonabsorbent_assurance", 125 | "feature_coastal_edible_whang", 126 | "feature_disprovable_topmost_burrower", 127 | "feature_trisomic_hagiographic_fragrance", 128 | "feature_queenliest_childing_ritual", 129 | "feature_censorial_leachier_rickshaw", 130 | "feature_daylong_ecumenic_lucina", 131 | "feature_steric_coxcombic_relinquishment", 132 | ] 133 | feature_col = list(set(feature_col) - set(bad_features)) 134 | 135 | ## Features and Targets are DataFrame 136 | if data_version == "signals": 137 | features = df[feature_col].fillna(0) 138 | ## For Numerai Classic Tournament, v4 dataset 139 | else: 140 | features = df[feature_col].fillna(2) - 2 141 | 142 | target_median = 
df[target_col].median() 143 | targets = df[target_col].fillna(target_median) - target_median 144 | ## Group column has to be pd.Series for time-series cross validation 145 | groups = df[era_col] 146 | ## weights column has to be pd.Series for time-series cross validation 147 | df["weights"] = 1 148 | weights = df["weights"] 149 | return features.astype(np.int8), targets.astype(np.float32), groups, weights 150 | 151 | 152 | def load_numerai_data( 153 | data_folder, 154 | feature_metadata="v4_features.json", 155 | resample=0, 156 | resample_freq=1, 157 | target_col=["target"], 158 | era_col="era", 159 | data_version="v4", 160 | startera=None, 161 | endera=None, 162 | ): 163 | if data_version in [ 164 | "v4", 165 | "v4.1", 166 | "v5", 167 | "v6", 168 | ]: 169 | features_list = list() 170 | targets_list = list() 171 | groups_list = list() 172 | weights_list = list() 173 | 174 | if startera is None: 175 | startera = "0001" 176 | if endera is None: 177 | endera = "0001" 178 | 179 | for i in range(int(startera) + resample, int(endera) + 1, resample_freq): 180 | if i <= 9: 181 | test_start_str = "000" + str(i) 182 | elif i <= 99: 183 | test_start_str = "00" + str(i) 184 | elif i <= 999: 185 | test_start_str = "0" + str(i) 186 | else: 187 | test_start_str = str(i) 188 | 189 | data_file = f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 190 | 191 | features, targets, groups, weights = load_numerai_data_era( 192 | data_file, 193 | feature_metadata=feature_metadata, 194 | resample=0, 195 | resample_freq=1, 196 | target_col=target_col, 197 | era_col=era_col, 198 | data_version=data_version, 199 | startera=test_start_str, 200 | endera=test_start_str, 201 | ) 202 | 203 | features_list.append(features) 204 | targets_list.append(targets) 205 | groups_list.append(groups) 206 | weights_list.append(weights) 207 | 208 | return ( 209 | pd.concat(features_list), 210 | pd.concat(targets_list), 211 | pd.concat(groups_list), 212 | pd.concat(weights_list), 213 | ) 214 | else: 215 | features, targets, groups, weights = load_numerai_data_era( 216 | data_folder, 217 | feature_metadata=feature_metadata, 218 | resample=resample, 219 | resample_freq=resample_freq, 220 | target_col=target_col, 221 | era_col=era_col, 222 | data_version=data_version, 223 | startera=startera, 224 | endera=endera, 225 | ) 226 | return features, targets, groups, weights 227 | 228 | 229 | """ 230 | Generate Predictions for Numerai 231 | 232 | trained_model: model object which has method .predict to generate predictions 233 | parameters: dictionary which contains parameters of the trained_model 234 | modelname: str Name of Model 235 | start_iteration: for tree-based methods, skip the first N trees in model when generating predictions 236 | startera: first era to get predictions 237 | endera: last era to get predictions 238 | 239 | 240 | Output: prediction_df: pd.DataFrame with columns era, prediction, model_name, target_col 241 | 242 | """ 243 | 244 | 245 | class FeatureMomentumModel: 246 | def __init__( 247 | self, 248 | lookback=52, 249 | shift=6, 250 | correlation_file_path=None, 251 | portfolio_file_path=None, 252 | target_col=None, 253 | seed=0, 254 | ): 255 | self.seed = seed 256 | self.lookback = lookback 257 | self.shift = shift 258 | self.correlation_file_path = correlation_file_path 259 | self.portfolio_file_path = portfolio_file_path 260 | self.target_col = target_col 261 | 262 | def predict(self, features): 263 | correlation_matrix = pd.read_parquet(self.correlation_file_path) 264 | factor_momentum = ( 265 | 
correlation_matrix.shift(self.shift) 266 | .fillna(0) 267 | .rolling(self.lookback) 268 | .mean() 269 | .dropna() 270 | ) 271 | last_momentum = factor_momentum.tail(1).transpose().squeeze()[features.columns] 272 | preds = features * np.sign(last_momentum) 273 | return preds.mean(axis=1) 274 | 275 | def copy_performance(self, outputfolder): 276 | portfolio = pd.read_csv(self.portfolio_file_path, index_col=0) 277 | portfolio.columns = [f"feature-momentum_None_1_{self.seed}_1-baseline-0"] 278 | portfolio.to_csv(f"{outputfolder}/feature-momentum_None_1_{self.seed}_1.csv") 279 | 280 | 281 | def predict_numerai( 282 | features_raw, 283 | targets, 284 | groups, 285 | trained_model, 286 | parameters, 287 | modelname="sample", 288 | gbm_start_iteration=0, ## Backward Comptability 289 | era_col="era", 290 | debug=False, 291 | ): 292 | ## Score on Dataset 293 | 294 | selected_cols = parameters["parameters"]["model"]["feature_columns"] 295 | target_col = parameters["parameters"]["model"]["target_columns"] 296 | 297 | ## Transform Features 298 | if parameters["parameters"]["model"]["feature_engineering"] is not None: 299 | if parameters["parameters"]["model"]["feature_engineering"] in [ 300 | "numerai", 301 | ]: 302 | feature_eng_parameters = parameters["parameters"]["feature_eng"] 303 | transformer = NumeraiTransformer(**feature_eng_parameters) 304 | transformer.data = parameters["transformer"] 305 | features = transformer.transform( 306 | features_raw[selected_cols], is_train=False 307 | ) 308 | if parameters["parameters"]["model"]["feature_engineering"] in [ 309 | "numeraiv4", 310 | "numeraiv4.1", 311 | ]: 312 | feature_eng_parameters = parameters["parameters"]["feature_eng"] 313 | transformer = NumeraiTransformerV4(**feature_eng_parameters) 314 | transformer.data = parameters["transformer"] 315 | features = transformer.transform( 316 | features_raw[selected_cols], is_train=False 317 | ) 318 | else: 319 | features = features_raw[selected_cols] 320 | 321 | ## Run Predictions 322 | ## For tree-based models can run some of the trees only 323 | if parameters["parameters"]["model"]["tabular_model"] in [ 324 | "lightgbm", 325 | "lightgbm-gbdt", 326 | "lightgbm-dart", 327 | "lightgbm-goss", 328 | ]: 329 | ## Backward Compatability 330 | if "additional" in parameters["parameters"] and gbm_start_iteration is None: 331 | gbm_start_iteration = parameters["parameters"]["additional"].get( 332 | "gbm_start_iteration", 0 333 | ) 334 | start_iteration = min( 335 | gbm_start_iteration, int(trained_model.num_trees() * 0.75) 336 | ) 337 | predictions_raw = trained_model.predict( 338 | features, start_iteration=start_iteration 339 | ) 340 | elif parameters["parameters"]["model"]["tabular_model"] in [ 341 | "xgboost", 342 | ]: 343 | if hasattr(trained_model, "best_iteration"): 344 | end_iteration = trained_model.best_iteration 345 | else: 346 | end_iteration = trained_model.num_boosted_rounds() 347 | start_iteration = min(gbm_start_iteration, int(end_iteration * 0.75)) 348 | xgboost_features = xgboost.DMatrix(features) 349 | predictions_raw = trained_model.predict( 350 | xgboost_features, 351 | iteration_range=(start_iteration, end_iteration), 352 | ) 353 | elif parameters["parameters"]["model"]["tabular_model"] in [ 354 | "Numerai-MLP", 355 | "Numerai-LSTM", 356 | ]: 357 | predictions_raw = trained_model.predict(features.values) 358 | elif parameters["parameters"]["model"]["tabular_model"] in [ 359 | "tabnet", 360 | ]: 361 | predictions_raw = trained_model.predict(features.values) 362 | elif 
parameters["parameters"]["model"]["tabular_model"] in [ 363 | "feature-momentum", 364 | ]: 365 | trained_model = FeatureMomentumModel(**parameters["parameters"]["tabular"]) 366 | predictions_raw = trained_model.predict(features) 367 | else: 368 | ## General Model which implements a predict method 369 | predictions_raw = trained_model.predict(features) 370 | 371 | ## Process Predictions into DataFrame 372 | predictions = pd.DataFrame( 373 | predictions_raw, 374 | columns=target_col, 375 | index=targets.index, 376 | ) 377 | predictions[era_col] = groups 378 | ## Rank Predictions within each era 379 | normalised_predictions = list() 380 | for i, df in predictions.groupby(era_col): 381 | per_era = df[target_col].rank(pct=True, axis=0) 382 | normalised_predictions.append(per_era) 383 | processed_predictions = pd.concat(normalised_predictions, axis=0) 384 | predictions["prediction"] = processed_predictions[target_col].mean(axis=1) 385 | prediction_df = pd.concat([predictions[[era_col, "prediction"]], targets], axis=1) 386 | prediction_df["model_name"] = modelname 387 | return prediction_df 388 | 389 | 390 | def predict_numerai_multiple( 391 | Numerai_Model_Names, 392 | correlation_matrix=None, 393 | filename="data/v4_all_int8.parquet", 394 | data_version="v4", 395 | startera=None, 396 | endera=None, 397 | debug=False, 398 | era_col="era", 399 | target_col=["target"], 400 | embargo=26, 401 | gbm_start_iteration=0, 402 | ): 403 | features, targets, groups, weights = load_numerai_data( 404 | filename, 405 | target_col=target_col, 406 | era_col=era_col, 407 | data_version=data_version, 408 | startera=startera, 409 | endera=endera, 410 | ) 411 | 412 | INDEX_COL_NAMES = features.index.names 413 | 414 | prediction_df_list = list() 415 | score_df_list = list() 416 | 417 | for Numerai_Model_Name in Numerai_Model_Names: 418 | modelname = Numerai_Model_Name.replace(".parameters", ".model") 419 | parameters = joblib.load(Numerai_Model_Name) 420 | most_recent_model = load_best_model( 421 | parameters["parameters"]["model"]["tabular_model"], modelname 422 | ) 423 | 424 | ## Check Embargo Period for Numerai Classic Models 425 | if data_version in [ 426 | "v4", 427 | "v4.1", 428 | "v5", 429 | "v6", 430 | ]: 431 | test_start = shift_era( 432 | parameters["parameters"]["model"]["validation_end"], embargo 433 | ) 434 | required_index = groups[groups >= test_start].index 435 | else: 436 | required_index = groups.index 437 | 438 | if debug: 439 | print(modelname, test_start) 440 | 441 | if required_index.shape[0] > 0: 442 | prediction_df = predict_numerai( 443 | features.loc[required_index], 444 | targets.loc[required_index], 445 | groups.loc[required_index], 446 | most_recent_model, 447 | parameters, 448 | modelname=modelname, 449 | gbm_start_iteration=gbm_start_iteration, 450 | era_col=era_col, 451 | debug=debug, 452 | ) 453 | prediction_df_list.append(prediction_df) 454 | 455 | if debug: 456 | print(prediction_df.columns, prediction_df.shape) 457 | 458 | if len(prediction_df_list) > 0: 459 | output_cols = [era_col, "prediction"] + target_col 460 | average_prediction_df = ( 461 | pd.concat(prediction_df_list, axis=0) 462 | .groupby(INDEX_COL_NAMES)[output_cols] 463 | .mean() 464 | ) 465 | average_prediction_df[era_col] = groups 466 | if debug: 467 | print(average_prediction_df.columns, average_prediction_df.shape) 468 | 469 | return average_prediction_df.sort_values(era_col), prediction_df_list 470 | else: 471 | return pd.DataFrame(), pd.DataFrame() 472 | 473 | 474 | """ 475 | Score Numerai Models with FN using 
CUDA 476 | 477 | prediction_df: pd.DataFrame with columns era, prediction, model_name, target_col and index id 478 | features: pd.DataFrame with columns feature_xxx and index id 479 | riskiest_fatures: list of str 480 | 481 | 482 | Output 483 | prediction_df: pd.DataFrame with columns era, model_name, prediction, neutralised_prediction, target_col, index id 484 | correlations_by_era: pd.DataFrame with columns correlation, normalised_correlation, neutralised_correlation, index era 485 | 486 | """ 487 | 488 | 489 | def score_numerai( 490 | prediction_df, 491 | features, 492 | riskiest_features, 493 | proportion=0, 494 | modelname="sample", 495 | target_col_name="target", 496 | prediction_col="prediction", 497 | era_col="era", 498 | debug=False, 499 | ): 500 | ## Find Correlation by era 501 | correlations_by_era = list() 502 | for i, df in prediction_df.groupby(era_col): 503 | output = dict() 504 | output[era_col] = i 505 | ## Computation on CUDA 506 | if torch.cuda.is_available(): 507 | temp = ( 508 | scipy.stats.rankdata(df[prediction_col], method="ordinal") - 0.5 509 | ) / len(df[prediction_col]) 510 | df["normalised_prediction"] = scipy.stats.norm.ppf(temp) 511 | ## Neutralised targets (FNC) 512 | if proportion > 0 and len(riskiest_features) > 0: 513 | exposures = cp.asarray(features.loc[df.index, riskiest_features]) 514 | normalised_prediction = cp.asarray(df["normalised_prediction"]) 515 | gram_mtx = cp.dot(cp.linalg.pinv(exposures), normalised_prediction) 516 | projected_values = normalised_prediction - cp.asarray( 517 | proportion 518 | ) * cp.dot(exposures, gram_mtx) 519 | df["neutralised_prediction"] = projected_values.get() 520 | df["neutralised_prediction"] = ( 521 | df["neutralised_prediction"] / df["neutralised_prediction"].std() 522 | ) 523 | output["neutralised_correlation"] = cp.corrcoef( 524 | cp.asarray(df[target_col_name]), 525 | cp.asarray(df["neutralised_prediction"].rank(pct=True)), 526 | )[0, 1].get() 527 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 528 | "neutralised_prediction" 529 | ].rank(pct=True) 530 | else: 531 | output["neutralised_correlation"] = cp.corrcoef( 532 | cp.asarray(df[target_col_name]), 533 | cp.asarray(df[prediction_col].rank(pct=True)), 534 | )[0, 1].get() 535 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 536 | prediction_col 537 | ].rank(pct=True) 538 | ### Computation on CPU 539 | else: 540 | ## Normalise prediction 541 | temp = ( 542 | scipy.stats.rankdata(df[prediction_col], method="ordinal") - 0.5 543 | ) / len(df[prediction_col]) 544 | df["normalised_prediction"] = scipy.stats.norm.ppf(temp) 545 | ## Neutralised targets (FNC) 546 | if proportion > 0 and len(riskiest_features) > 0: 547 | exposures = features.loc[df.index, riskiest_features] 548 | df["neutralised_prediction"] = df[ 549 | "normalised_prediction" 550 | ] - proportion * exposures.dot( 551 | np.linalg.pinv(exposures).dot(df["normalised_prediction"]) 552 | ) 553 | df["neutralised_prediction"] = ( 554 | df["neutralised_prediction"] / df["neutralised_prediction"].std() 555 | ) 556 | output["neutralised_correlation"] = np.corrcoef( 557 | df[target_col_name], df["neutralised_prediction"].rank(pct=True) 558 | )[0, 1] 559 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 560 | "neutralised_prediction" 561 | ].rank(pct=True) 562 | else: 563 | output["neutralised_correlation"] = np.corrcoef( 564 | df[target_col_name], df[prediction_col].rank(pct=True) 565 | )[0, 1] 566 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 567 | 
prediction_col 568 | ].rank(pct=True) 569 | correlations_by_era.append(output) 570 | ## Generate Overall files 571 | correlations_by_era_all = pd.DataFrame.from_records(correlations_by_era) 572 | prediction_df["model_name"] = modelname 573 | correlations_by_era_all["model_name"] = modelname 574 | return prediction_df, correlations_by_era_all 575 | 576 | 577 | """ 578 | Linear Factor Model 579 | Factor Timing 580 | rawdata: pd.DataFrame: Numerai dataset with columns containing the 1149 features and 20 targets, index id 581 | """ 582 | 583 | 584 | def numerai_feature_correlation_matrix( 585 | rawdata, feature_col=None, target_col_name=None, era_col="era" 586 | ): 587 | output = dict() 588 | for i, df in rawdata.groupby(era_col): 589 | corr_dict = dict() 590 | for feature in feature_col: 591 | corr_dict[feature] = np.corrcoef( 592 | df[feature].fillna(2).astype(float), df[target_col_name] 593 | )[0, 1] 594 | output[i] = corr_dict 595 | 596 | return pd.DataFrame.from_records(output).transpose()[feature_col] 597 | 598 | 599 | def numerai_feature_momentum( 600 | data_folder="../data/era", 601 | output_folder="../data/feature_momentum", 602 | data_version="v4", 603 | startera="0001", 604 | endera="1037", 605 | era_col="era", 606 | lookback=52, 607 | update_correlation_mtx=True, 608 | feature_col=None, 609 | ): 610 | if update_correlation_mtx: 611 | ## Calculate Correlation Matrix 612 | for i in range(int(startera), int(endera) + 1): 613 | if i <= 9: 614 | test_start_str = "000" + str(i) 615 | elif i <= 99: 616 | test_start_str = "00" + str(i) 617 | elif i <= 999: 618 | test_start_str = "0" + str(i) 619 | else: 620 | test_start_str = str(i) 621 | data_file = f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 622 | rawdata = pd.read_parquet(data_file) 623 | if feature_col is None: 624 | feature_col = [x for x in rawdata.columns if x.startswith("feature_")] 625 | target_cols = [x for x in rawdata.columns if x.startswith("target_")] 626 | for target_col in target_cols: 627 | correlation_file = f"{output_folder}/{target_col}_corr.parquet" 628 | if os.path.exists(correlation_file): 629 | correlation_matrix_old = pd.read_parquet(correlation_file) 630 | if test_start_str > correlation_matrix_old.index[-1]: 631 | rawdata_copy = rawdata.dropna(subset=[target_col]).copy() 632 | feature_col = correlation_matrix_old.columns 633 | if rawdata_copy.shape[0] > 0: 634 | correlation_matrix = numerai_feature_correlation_matrix( 635 | rawdata_copy, feature_col, target_col 636 | ) 637 | pd.concat( 638 | [correlation_matrix_old, correlation_matrix] 639 | ).to_parquet(correlation_file) 640 | else: 641 | rawdata_copy = rawdata.dropna(subset=[target_col]).copy() 642 | if rawdata_copy.shape[0] > 0: 643 | correlation_matrix = numerai_feature_correlation_matrix( 644 | rawdata_copy, feature_col, target_col 645 | ) 646 | correlation_matrix.to_parquet(correlation_file) 647 | 648 | data_file = f"{data_folder}/{data_version}_{endera}_int8.parquet" 649 | rawdata = pd.read_parquet(data_file) 650 | # feature_col = [x for x in rawdata.columns if x.startswith("feature_")] 651 | target_cols = [x for x in rawdata.columns if x.startswith("target_")] 652 | 653 | ## Factor Momentum Portfolio 654 | for target_col in target_cols: 655 | correlation_file = f"{output_folder}/{target_col}_corr.parquet" 656 | correlation_matrix = pd.read_parquet(correlation_file) 657 | feature_col = correlation_matrix.columns 658 | 659 | if "60" in target_col: 660 | gap = 14 661 | else: 662 | gap = 6 663 | 664 | factor_momentum = ( 665 | 
correlation_matrix.shift(gap).fillna(0).rolling(lookback).mean().dropna() 666 | ) 667 | factor_volatility = ( 668 | correlation_matrix.shift(gap).fillna(0).rolling(lookback).std().dropna() 669 | ) 670 | fm_max_index = factor_momentum.index.max() 671 | fm_min_index = factor_momentum.index.min() 672 | 673 | factor_momentum_eras = factor_momentum.unstack(level=0).reset_index() 674 | factor_momentum_eras.columns = ["feature_name", "era", "momentum"] 675 | factor_volatility_eras = factor_volatility.unstack(level=0).reset_index() 676 | factor_volatility_eras.columns = ["feature_name", "era", "volatility"] 677 | 678 | for i in range(int(startera), int(endera) + 1): 679 | if i <= 9: 680 | test_start_str = "000" + str(i) 681 | elif i <= 99: 682 | test_start_str = "00" + str(i) 683 | elif i <= 999: 684 | test_start_str = "0" + str(i) 685 | else: 686 | test_start_str = str(i) 687 | 688 | if (test_start_str <= fm_max_index) & (test_start_str >= fm_min_index): 689 | factor_file = f"{output_folder}/{target_col}_feature_momentum.csv" 690 | if os.path.exists(factor_file): 691 | factor_portfolio_old = pd.read_csv(factor_file, index_col=0) 692 | factor_portfolio_old.index = pd.to_datetime( 693 | factor_portfolio_old.index 694 | ) 695 | if ( 696 | convert_era_to_datetime(test_start_str) 697 | > factor_portfolio_old.index[-1] 698 | ): 699 | update = True 700 | else: 701 | update = False 702 | else: 703 | update = True 704 | 705 | if update: 706 | ## Read Data 707 | data_file = ( 708 | f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 709 | ) 710 | df = pd.read_parquet(data_file) 711 | feature_col = [x for x in df.columns if x.startswith("feature_")] 712 | df[feature_col] = df[feature_col].fillna(2) - 2 713 | 714 | ## Factor Momentum 715 | portfolio_predictions = df[[era_col, "target"]] 716 | per_era = df[feature_col] * np.sign( 717 | factor_momentum.loc[test_start_str, feature_col] 718 | ) 719 | portfolio_predictions["prediction"] = per_era.mean(axis=1) 720 | prediction_era, correlations_era = score_numerai( 721 | portfolio_predictions, 722 | df[feature_col], 723 | None, 724 | proportion=0, 725 | modelname=f"{target_col}_feature_momentum-baseline-0", 726 | target_col_name="target", 727 | ) 728 | factor_porfolio = create_era_index( 729 | correlations_era.pivot( 730 | index="era", 731 | columns=["model_name"], 732 | values=["neutralised_correlation"], 733 | ) 734 | ) 735 | if os.path.exists(factor_file): 736 | factor_portfolio_old = pd.read_csv(factor_file, index_col=0) 737 | factor_portfolio_old.index = pd.to_datetime( 738 | factor_portfolio_old.index 739 | ) 740 | pd.concat( 741 | [ 742 | factor_portfolio_old, 743 | factor_porfolio["neutralised_correlation"], 744 | ] 745 | ).to_csv(factor_file) 746 | else: 747 | factor_porfolio["neutralised_correlation"].to_csv(factor_file) 748 | 749 | 750 | """ 751 | 752 | Benchmark Performances of Numerai Models 753 | 754 | Run Model Performances for models trained with a single ML model 755 | 756 | """ 757 | 758 | 759 | def dynamic_feature_neutralisation( 760 | prediction_df, 761 | features_raw, 762 | feature_corr=None, 763 | features_optimizer=None, 764 | modelname="sample", 765 | era_col="era", 766 | target_col=["target"], 767 | cutoff=420, 768 | gap=6, 769 | lookback=52, 770 | proportion=1, 771 | debug=False, 772 | ): 773 | if features_optimizer is None: 774 | features_optimizer = features_raw.columns[:cutoff] 775 | 776 | if feature_corr is None: 777 | ## Get index by era 778 | prediction_dynamic = list() 779 | correlation_dynamic = list() 780 | for i, df in 
prediction_df.groupby(era_col): 781 | if debug: 782 | print(modelname, i, df.shape) 783 | prediction_df_era = prediction_df.loc[df.index] 784 | features_raw_era = features_raw.loc[df.index] 785 | ## Baseline 786 | prediction_df_era_new, correlations_by_era = score_numerai( 787 | prediction_df_era, 788 | features_raw_era, 789 | list(), 790 | proportion=0, 791 | modelname=f"{modelname}-baseline", 792 | target_col_name=target_col[0], 793 | era_col=era_col, 794 | debug=debug, 795 | ) 796 | prediction_dynamic.append(prediction_df_era_new.copy()) 797 | correlation_dynamic.append(correlations_by_era) 798 | return pd.concat(prediction_dynamic, axis=0), pd.concat( 799 | correlation_dynamic, axis=0 800 | ) 801 | 802 | else: 803 | ## Generate Feature Momentum Leaderboard 804 | factor_mean = ( 805 | feature_corr.shift(gap).fillna(0).rolling(lookback).mean().dropna() 806 | ) 807 | factor_volatility = ( 808 | feature_corr.shift(gap).fillna(0).rolling(lookback).std().dropna() 809 | ) 810 | factor_skew = ( 811 | feature_corr.shift(gap).fillna(0).rolling(lookback).skew().dropna() 812 | ) 813 | factor_kurt = ( 814 | feature_corr.shift(gap).fillna(0).rolling(lookback).kurt().dropna() 815 | ) 816 | factor_drawdown = ( 817 | (-1 * (feature_corr.cumsum() - feature_corr.cumsum().cummax()).cummin()) 818 | .shift(gap) 819 | .fillna(0) 820 | ) 821 | factor_sharpe = factor_mean / factor_volatility 822 | factor_calmar = factor_mean / factor_drawdown 823 | factor_autocorrelation = ( 824 | feature_corr.rolling(lookback) 825 | .corr(feature_corr.shift(4)) 826 | .shift(gap) 827 | .fillna(0) 828 | ) 829 | 830 | fm_max_index = factor_mean.index.max() 831 | fm_min_index = factor_mean.index.min() 832 | 833 | ## 834 | factor_flavour_eras = dict() 835 | for flavour in [ 836 | "mean", 837 | "volatility", 838 | ]: 839 | factor_flavour_eras[flavour] = ( 840 | locals()[f"factor_{flavour}"].unstack(level=0).reset_index() 841 | ) 842 | factor_flavour_eras[flavour].columns = ["feature_name", "era", flavour] 843 | 844 | ## Get index by era 845 | prediction_dynamic = list() 846 | correlation_dynamic = list() 847 | for i, df in prediction_df.groupby(era_col): 848 | if debug: 849 | print(modelname, i, df.shape) 850 | if (i <= fm_max_index) & (i >= fm_min_index): 851 | prediction_df_era = prediction_df.loc[df.index] 852 | features_raw_era = features_raw.loc[df.index] 853 | 854 | ## Baseline 855 | prediction_df_era_new, correlations_by_era = score_numerai( 856 | prediction_df_era, 857 | features_raw_era, 858 | list(), 859 | proportion=0, 860 | modelname=f"{modelname}-baseline", 861 | target_col_name=target_col[0], 862 | era_col=era_col, 863 | debug=debug, 864 | ) 865 | prediction_dynamic.append(prediction_df_era_new.copy()) 866 | correlation_dynamic.append(correlations_by_era) 867 | 868 | ## For v4-data only 869 | bad_features = [ 870 | "feature_palpebral_univalve_pennoncel", 871 | "feature_unsustaining_chewier_adnoun", 872 | "feature_brainish_nonabsorbent_assurance", 873 | "feature_coastal_edible_whang", 874 | "feature_disprovable_topmost_burrower", 875 | "feature_trisomic_hagiographic_fragrance", 876 | "feature_queenliest_childing_ritual", 877 | "feature_censorial_leachier_rickshaw", 878 | "feature_daylong_ecumenic_lucina", 879 | "feature_steric_coxcombic_relinquishment", 880 | ] 881 | 882 | features_optimizer = list(set(features_optimizer) - set(bad_features)) 883 | 884 | ## Optimizer 885 | prediction_df_era_new, correlations_by_era = score_numerai( 886 | prediction_df_era, 887 | features_raw_era, 888 | features_optimizer, 889 | 
proportion=proportion, 890 | modelname=f"{modelname}-optimizer", 891 | target_col_name=target_col[0], 892 | era_col=era_col, 893 | debug=debug, 894 | ) 895 | prediction_dynamic.append(prediction_df_era_new.copy()) 896 | correlation_dynamic.append(correlations_by_era) 897 | 898 | ### Dynamic Feature Neutralisation by different criteria 899 | DFN_params = list() 900 | for flavour in [ 901 | "mean", 902 | "volatility", 903 | ]: 904 | for direction in [ 905 | "tail", 906 | "head", 907 | ]: 908 | for size in [ 909 | 420, 910 | ]: 911 | if direction == "tail": 912 | name = f"high_{flavour}_" 913 | else: 914 | name = f"low_{flavour}_" 915 | if size == 420: 916 | name = name + "standard" 917 | elif size == 105: 918 | name = name + "small" 919 | temp = (flavour, size, direction, name) 920 | DFN_params.append(temp) 921 | 922 | for DFN_param in DFN_params: 923 | flavour = DFN_param[0] 924 | factor_flavour_era = factor_flavour_eras[flavour][ 925 | factor_flavour_eras[flavour]["era"] == i 926 | ] 927 | selected_features = getattr( 928 | factor_flavour_era.sort_values(flavour), DFN_param[2] 929 | )(DFN_param[1])["feature_name"] 930 | 931 | selected_features = list(set(selected_features) - set(bad_features)) 932 | 933 | prediction_df_era_new, correlations_by_era = score_numerai( 934 | prediction_df_era, 935 | features_raw_era, 936 | selected_features, 937 | proportion=proportion, 938 | modelname=f"{modelname}-{DFN_param[3]}", 939 | target_col_name=target_col[0], 940 | era_col=era_col, 941 | debug=debug, 942 | ) 943 | prediction_dynamic.append(prediction_df_era_new.copy()) 944 | correlation_dynamic.append(correlations_by_era) 945 | 946 | return pd.concat(prediction_dynamic, axis=0), pd.concat( 947 | correlation_dynamic, axis=0 948 | ) 949 | 950 | 951 | def save_model_performance_test( 952 | Numerai_Model_Names, 953 | feature_corr, 954 | features_optimizer, 955 | startera=None, 956 | endera=None, 957 | data_file="data/v4_all_int8.parquet", 958 | data_version="v4", 959 | target_col=["target"], 960 | debug=False, 961 | gbm_start_iteration=0, 962 | ): 963 | ( 964 | average_prediction_df, 965 | prediction_df_list, 966 | ) = predict_numerai_multiple( 967 | Numerai_Model_Names, 968 | feature_corr, 969 | filename=data_file, 970 | data_version=data_version, 971 | startera=startera, 972 | endera=endera, 973 | debug=debug, 974 | target_col=target_col, 975 | gbm_start_iteration=gbm_start_iteration, 976 | ) 977 | 978 | del prediction_df_list 979 | 980 | MODEL_NAME = Numerai_Model_Names[0].split(".parameters")[0].split("/")[-1] 981 | MODEL_NAME = MODEL_NAME + f"_{len(Numerai_Model_Names)}" 982 | 983 | ( 984 | features, 985 | targets, 986 | groups, 987 | weights, 988 | ) = load_numerai_data( 989 | data_file, 990 | resample_freq=1, 991 | startera=startera, 992 | endera=endera, 993 | target_col=target_col, 994 | data_version=data_version, 995 | ) 996 | 997 | dynamic_predictions, dynamic_correlations = dynamic_feature_neutralisation( 998 | average_prediction_df, 999 | features, 1000 | feature_corr, 1001 | features_optimizer, 1002 | target_col=target_col, 1003 | modelname=MODEL_NAME, 1004 | debug=debug, 1005 | ) 1006 | summary_correlations = dynamic_correlations.pivot( 1007 | index="era", columns="model_name", values=["neutralised_correlation"] 1008 | ).dropna() 1009 | strategy_flavour = pd.DataFrame.from_records( 1010 | summary_correlations.apply(strategy_metrics, axis=0), 1011 | index=summary_correlations.columns, 1012 | ) 1013 | if data_version == "signals": 1014 | return ( 1015 | strategy_flavour, 1016 | 
summary_correlations, 1017 | dynamic_predictions, 1018 | ) 1019 | else: 1020 | return ( 1021 | strategy_flavour, 1022 | create_era_index(summary_correlations), 1023 | dynamic_predictions, 1024 | ) 1025 | 1026 | 1027 | ## Run Numerai Model Performances for both Classic and Signals tournament 1028 | def run_numerai_models_performances( 1029 | Numerai_Model_Names, 1030 | feature_corr, 1031 | features_optimizer, 1032 | PERFORMANCES_FOLDER, 1033 | data_file="data/v4_all_int8.parquet", 1034 | data_version="v4", 1035 | target_col=["target"], 1036 | gbm_start_iteration=0, 1037 | ): 1038 | ## Calculate Starting Era 1039 | parametername = Numerai_Model_Names[0] 1040 | no_models = len(Numerai_Model_Names) 1041 | stem = parametername.split("/")[-1].replace(".parameters", "") 1042 | correlations_filename = f"{PERFORMANCES_FOLDER}/{stem}_{no_models}.csv" 1043 | if os.path.exists(parametername): 1044 | parameters = joblib.load(parametername) 1045 | if data_version == "signals": 1046 | test_start = parameters["parameters"]["model"]["validation_end"] 1047 | test_end = datetime.datetime.strptime("2099-12-31", "%Y-%m-%d") 1048 | else: 1049 | test_start = shift_era( 1050 | parameters["parameters"]["model"]["validation_end"], gap=14 1051 | ) 1052 | test_end = feature_corr.index[-1] 1053 | if os.path.exists(correlations_filename): 1054 | most_recent_date = pd.read_csv(correlations_filename, index_col=0).index[-1] 1055 | if data_version == "signals": 1056 | test_start = datetime.datetime.strptime(most_recent_date, "%Y-%m-%d") 1057 | else: 1058 | test_start = shift_era(convert_datetime_to_era(most_recent_date), gap=1) 1059 | print(f"Model Performances {test_start} {test_end}") 1060 | ### Get Model Predictions for the latest eras 1061 | if test_end >= test_start: 1062 | ( 1063 | validate_performance, 1064 | validate_correlations, 1065 | validate_predictions, 1066 | ) = save_model_performance_test( 1067 | Numerai_Model_Names, 1068 | feature_corr, 1069 | features_optimizer, 1070 | startera=test_start, 1071 | endera=test_end, 1072 | data_file=data_file, 1073 | data_version=data_version, 1074 | target_col=target_col, 1075 | gbm_start_iteration=gbm_start_iteration, 1076 | ) 1077 | ## Update Model Performances 1078 | output = validate_correlations["neutralised_correlation"] 1079 | if os.path.exists(correlations_filename): 1080 | old_file = pd.read_csv(correlations_filename, index_col=0) 1081 | df = pd.concat([old_file, output.dropna()]) 1082 | df.index = pd.to_datetime(df.index) 1083 | df[~df.index.duplicated()].sort_index().to_csv(correlations_filename) 1084 | else: 1085 | output.dropna().to_csv(correlations_filename) 1086 | --------------------------------------------------------------------------------