├── vscode.env ├── src └── pythor │ ├── __init__.py │ ├── feature.py │ ├── neural.py │ ├── optimisation.py │ ├── benchmark.py │ ├── util.py │ └── numerai.py ├── setup.cfg ├── MANIFEST.in ├── Numerai_Paper3_Rain-2.pdf ├── requirements.txt ├── .vscode └── settings.json ├── tox.ini ├── .editorconfig ├── pyproject.toml ├── Dockerfile ├── .github └── workflows │ ├── test.yml │ └── release.yml ├── LICENSE ├── setup.py ├── README.md └── .gitignore /vscode.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=/;src/;${PYTHONPATH} 2 | -------------------------------------------------------------------------------- /src/pythor/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.1.2" 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = attr: pythor.__version__ 3 | license_files = LICENSE 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include *.md 3 | include LICENSE 4 | recursive-include tests test*.py 5 | -------------------------------------------------------------------------------- /Numerai_Paper3_Rain-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThomasWong2022/thor-public/HEAD/Numerai_Paper3_Rain-2.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | joblib 3 | numpy 4 | pandas 5 | scipy 6 | scikit-learn 7 | torch 8 | signatory 9 | xgboost 10 | lightgbm 11 | catboost 12 | optuna 13 | cupy 14 | cuml 15 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.unittestEnabled": false, 3 | "python.testing.nosetestsEnabled": false, 4 | "python.testing.pytestEnabled": true, 5 | "python.envFile": "${workspaceRoot}/vscode.env" 6 | } 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{37,38,39} 3 | minversion = 3.3.0 4 | isolated_build = true 5 | 6 | [testenv] 7 | deps = 8 | check-manifest >= 0.42 9 | pytest 10 | commands = 11 | check-manifest --ignore 'tox.ini,tests/**,.editorconfig,vscode.env,.vscode/**' 12 | python setup.py check -m -s 13 | pytest tests {posargs} 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.py] 12 | charset = utf-8 13 | indent_style = space 14 | indent_size = 4 15 | end_of_line = lf 16 | insert_final_newline = true 17 | trim_trailing_whitespace = true 18 | 19 | [*.{md,mdx}] 20 | trim_trailing_whitespace = false 21 | -------------------------------------------------------------------------------- 
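The tox.ini above defines the py37–py39 test environments, which the CI workflow shown later invokes with `tox -e py`. A minimal sketch of running the same checks locally (assuming tox >= 3.3 and at least one of the listed Python interpreters are installed; these commands are not part of the repository itself):

```bash
# Run a single environment against whichever Python is on PATH,
# mirroring the `tox -e py` step in .github/workflows/test.yml
pip install tox
tox -e py

# Or run the full py37/py38/py39 matrix declared in tox.ini
tox
```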
/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=46.4.0", 3 | "wheel", 4 | "joblib", 5 | "numpy", 6 | "pandas", 7 | "scipy", 8 | "scikit-learn", 9 | "optuna", 10 | ] 11 | build-backend = "setuptools.build_meta" 12 | [options.extras_require] 13 | cuda = [ 14 | "cupy", 15 | "cuml", 16 | "torch", 17 | "lightgbm --install-option=--cuda", 18 | "xgboost", 19 | "catboost", 20 | ] -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kaggle-gpu-images/python:latest 2 | #### Install Signatory, needs to update when newer supporting torch1.11.0 3 | RUN pip install signatory==1.2.6.1.9.0 --no-deps 4 | #### Additional NN models 5 | RUN pip install pytorch-lightning 6 | RUN pip install pytorch-tabnet --no-deps 7 | RUN pip install numerapi==2.12.9 8 | #### Build THOR package 9 | WORKDIR / 10 | COPY src/pythor/ src/pythor/ 11 | COPY setup.py setup.py 12 | COPY README.md README.md 13 | RUN pip install . 14 | WORKDIR /workspace 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - nonexisting 7 | pull_request: 8 | branches: 9 | - nonexisting 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python: ["3.7.12","3.8.16","3.9.16"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Setup Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python }} 24 | - name: Install Tox and any other packages 25 | run: pip install tox 26 | - name: Run Tox 27 | # Run tox using the version of Python in `PATH` 28 | run: tox -e py 29 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: __token__ 23 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload --repository pypi dist/* 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tom Chen (tomchen.org) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or 
substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="thorml", 8 | author="Thomas Wong", 9 | author_email="mw4315@ic.ac.uk", 10 | description="AutoML tools for Tabular Datasets", 11 | keywords="autoML", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ThomasWong2022/thor-public", 15 | project_urls={ 16 | "Documentation": "https://github.com/ThomasWong2022/thor-public", 17 | "Bug Reports": "https://github.com/ThomasWong2022/thor-public/issues", 18 | "Source Code": "https://github.com/ThomasWong2022/thor-public", 19 | }, 20 | package_dir={"": "src"}, 21 | packages=setuptools.find_packages(where="src"), 22 | classifiers=[ 23 | # see https://pypi.org/classifiers/ 24 | "Development Status :: 3 - Alpha", 25 | "Intended Audience :: Developers", 26 | "Topic :: Software Development :: Build Tools", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | "Programming Language :: Python :: 3.9", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "License :: OSI Approved :: MIT License", 33 | "Operating System :: OS Independent", 34 | ], 35 | python_requires=">=3.7", 36 | # install_requires=['Pillow'], 37 | extras_require={ 38 | "dev": ["check-manifest"], 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # THOR: Time-Varying High-dimensional Ordinal Regression 2 | 3 | [![Downloads](https://static.pepy.tech/badge/thorml)](https://pepy.tech/project/thorml) 4 | 5 | THOR is a new autoML tool for temporal tabular datasets and time series. It handles high-dimensional datasets with distribution shifts better than other tools, and it uses the latest research results from incremental learning to improve the robustness of machine learning methods. 6 | 7 | 8 | ### Docker 9 | 10 | As this package uses various machine learning and CUDA libraries for GPU support, we recommend using Docker to manage the dependencies. 11 | 12 | The image is available on [Docker Hub](https://hub.docker.com/repository/docker/thomaswong2023/thor-public/general). 13 | 14 | The following Docker image contains all the dependencies used by this tool. 15 | 16 | ```bash 17 | docker pull thomaswong2023/thor-public:deps 18 | docker run --gpus device=all -it -d --rm --name thor-public-example thomaswong2023/thor-public:deps bash 19 | 20 | ``` 21 | 22 | 23 | ### PyPI 24 | 25 | This project is also on [PyPI](https://pypi.org/project/thorml/). 26 | 27 | Install the package with the following command.
Dependencies are not installed with the package 28 | 29 | ```bash 30 | pip install thorml -r requirements.txt 31 | 32 | ``` 33 | 34 | 35 | 36 | ## Citation 37 | If you are using this package in your scientific work, we would appreciate citations to the following preprint on arxiv. 38 | 39 | [Dynamic Feature Projection and model selection methods for temporal tabular datasets with regime changes](https://arxiv.org/abs/2301.00790) 40 | 41 | Bibtex entry: 42 | ``` 43 | @misc{wong2023dynamic, 44 | title={Dynamic Feature Engineering and model selection methods for temporal tabular datasets with regime changes}, 45 | author={Thomas Wong and Mauricio Barahona}, 46 | year={2023}, 47 | eprint={2301.00790}, 48 | archivePrefix={arXiv}, 49 | primaryClass={q-fin.CP} 50 | } 51 | ``` 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /src/pythor/feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of feature enginnering methods for time-series data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | 21 | from joblib import Parallel, delayed 22 | import pandas as pd 23 | import numpy as np 24 | from sklearn.base import TransformerMixin, BaseEstimator 25 | import cupy as cp 26 | import cuml 27 | import torch, signatory 28 | 29 | 30 | import logging 31 | 32 | logger = logging.getLogger("Numerai") 33 | 34 | 35 | """ 36 | Feature Engineering used in Numerai Thesis 37 | """ 38 | 39 | 40 | class NumeraiTransformer(TransformerMixin, BaseEstimator): 41 | def __init__( 42 | self, 43 | seed=0, 44 | usesquare=False, 45 | dropout_pct=0.05, 46 | no_product_features=10, 47 | no_pca_features=0, 48 | ): 49 | self.seed = seed 50 | self.usesquare = usesquare 51 | self.dropout_pct = dropout_pct 52 | self.no_product_features = no_product_features 53 | self.no_pca_features = no_pca_features 54 | ## Data Dictionary to reconsturct transformer during inference 55 | self.data = dict() 56 | 57 | ## Transform Numerai Features with mean zero (-2,-1,0,1,2) 58 | def transform(self, X, is_train=True): 59 | ## Numpy Random Number Generator 60 | rng = np.random.default_rng(self.seed) 61 | 62 | ## Drop Out Matrix 63 | if self.dropout_pct > 0 and is_train: 64 | dropout_matrix = 1 - np.random.binomial(1, self.dropout_pct, X.shape) 65 | X_val = X.values * dropout_matrix 66 | 67 | if self.usesquare: 68 | squareX = pd.DataFrame(np.square(X_val), index=X.index) 69 | squareX.columns = ["{}_square".format(x) for x in X.columns] 70 | else: 71 | squareX = pd.DataFrame() 72 | 73 | ## Pair Transforms 74 | if self.no_product_features > 0: 75 | if is_train: 76 | col1 = np.random.choice(X.columns, self.no_product_features) 77 | col2 = np.random.choice( 78 | X.columns, 79 | self.no_product_features, 80 | ) 81 | self.product_features = pd.DataFrame( 82 | { 83 | "col1": col1, 84 | "col2": col2, 85 | } 86 | ).drop_duplicates() 87 | self.data["product_features"] = self.product_features 88 | else: 89 | self.product_features = 
self.data["product_features"] 90 | 91 | productX = pd.DataFrame( 92 | np.array(X[self.product_features["col1"]]) 93 | * np.array(X[self.product_features["col2"]]), 94 | index=X.index, 95 | ) 96 | productX.columns = [ 97 | f"feature_product_{i}" for i in range(self.product_features.shape[0]) 98 | ] 99 | else: 100 | productX = pd.DataFrame() 101 | 102 | ## Concat All Features to output 103 | transformed_features = pd.concat( 104 | [ 105 | X.astype(np.int8), 106 | squareX.astype(np.int8), 107 | productX.astype(np.int8), 108 | ], 109 | axis=1, 110 | ) 111 | 112 | return transformed_features 113 | 114 | 115 | class SignatureTransformer(TransformerMixin, BaseEstimator): 116 | def __init__( 117 | self, 118 | lookback, 119 | signature_level, 120 | ): 121 | self.lookback = lookback 122 | self.signature_level = signature_level 123 | 124 | ## Can also be used to transform data in an online fashion by transform 125 | def transform(self, X): 126 | history_length = X.shape[0] 127 | path_class = signatory.Path( 128 | torch.Tensor(cp.asarray([X.values])), self.signature_level 129 | ) 130 | sigs = list() 131 | for i in range(self.lookback, history_length): 132 | sigs.append(path_class.logsignature(i - self.lookback, i)) 133 | all_sig = torch.concat(sigs) 134 | transformed_signature = pd.DataFrame( 135 | all_sig.numpy(), index=X.index[self.lookback :] 136 | ) 137 | transformed_signature.columns = [ 138 | "lookback_{}_signature_{}".format(self.lookback, i) 139 | for i in range(transformed_signature.shape[1]) 140 | ] 141 | return transformed_signature 142 | 143 | 144 | def features_transform_batch(transformer, data, is_train=True): 145 | BATCH_SIZE = 10000000000 146 | start_index = 0 147 | transformed_features_batches = list() 148 | 149 | while start_index < data.shape[0]: 150 | data_batch = data.iloc[start_index : start_index + BATCH_SIZE] 151 | transformed_featrues_batch = pd.DataFrame( 152 | transformer.transform(data_batch, is_train=is_train), index=data_batch.index 153 | ) 154 | transformed_features_batches.append(transformed_featrues_batch) 155 | start_index = start_index + BATCH_SIZE 156 | 157 | transformed_features = pd.concat(transformed_features_batches, axis=0) 158 | return transformer, transformed_features 159 | 160 | 161 | def benchmark_features_transform( 162 | X_train, 163 | y_train, 164 | X_test=None, 165 | group_train=None, 166 | group_test=None, 167 | feature_eng=None, 168 | feature_eng_parameters=None, 169 | debug=False, 170 | ): 171 | ### Numerai 172 | if feature_eng in [ 173 | "numerai", 174 | ]: 175 | if feature_eng_parameters is None: 176 | feature_eng_parameters = { 177 | "usesquare": False, 178 | "no_product_features": 0, 179 | "seed": 10, 180 | } 181 | transformer = NumeraiTransformer(**feature_eng_parameters) 182 | 183 | if feature_eng is not None: 184 | extracted_features_train = transformer.transform(X_train, is_train=True) 185 | if X_test is not None: 186 | extracted_features_test = transformer.transform(X_test, is_train=False) 187 | else: 188 | extracted_features_test = None 189 | 190 | return transformer, extracted_features_train, extracted_features_test 191 | else: 192 | if X_test is not None: 193 | return None, X_train, X_test 194 | else: 195 | return None, X_train, None 196 | -------------------------------------------------------------------------------- /src/pythor/neural.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib, os, shutil, datetime 4 | import torch 5 | 6 | import 
logging, gc 7 | 8 | logger = logging.getLogger("Numerai") 9 | 10 | 11 | from torch import nn 12 | import torch.nn.functional as F 13 | from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset 14 | from pytorch_lightning import Trainer, LightningModule, seed_everything 15 | from pytorch_lightning.callbacks.early_stopping import EarlyStopping 16 | import cupy as cp 17 | 18 | 19 | # + 20 | ## Tabular Models 21 | # - 22 | 23 | 24 | class TabularModel: 25 | def __init__(self, nn_model, config): 26 | """ 27 | Args: 28 | nn_model (LightningModule): Neural Networks implmented as a LightningModule 29 | config (dict): A dictionary which contains the parameters for training NN 30 | """ 31 | 32 | self.nn_model = nn_model 33 | self.config = config 34 | seed_everything(config.get("seed", 0), workers=True) 35 | 36 | def train(self, X_train, y_train, X_validate, y_validate): 37 | self.config["input_shape"] = X_train.shape[1] 38 | self.config["output_shape"] = y_train.shape[1] 39 | 40 | self.network = self.nn_model(self.config) 41 | 42 | early_stop_callback = EarlyStopping( 43 | monitor="val_loss", 44 | min_delta=0.00, 45 | patience=self.config.get("patience", 5), 46 | verbose=False, 47 | mode="min", 48 | ) 49 | 50 | ## Assume X is a DataFrame, assume y is a DataFrame or pd Series 51 | dataset_train = TensorDataset( 52 | torch.from_numpy(X_train.values), torch.from_numpy(y_train.values) 53 | ) 54 | dataloader_train = DataLoader( 55 | dataset_train, 56 | batch_size=self.config.get("batch_size", 4096), 57 | num_workers=0, 58 | ) 59 | dataset_validate = TensorDataset( 60 | torch.from_numpy(X_validate.values), torch.from_numpy(y_validate.values) 61 | ) 62 | dataloader_validate = DataLoader( 63 | dataset_validate, 64 | batch_size=self.config.get("batch_size", 4096), 65 | num_workers=0, 66 | ) 67 | 68 | ## Use GPU if possible 69 | self.trainer = Trainer( 70 | accelerator="cuda", 71 | deterministic=True, 72 | auto_lr_find=True, 73 | max_epochs=self.config.get("max_epochs", 3), 74 | callbacks=[early_stop_callback], 75 | ) 76 | 77 | self.trainer.fit(self.network, dataloader_train, dataloader_validate) 78 | 79 | def predict(self, X): 80 | self.network.eval() 81 | with torch.no_grad(): 82 | predictions = self.network(torch.from_numpy(X)) 83 | return predictions.numpy() 84 | 85 | def load_model(self, checkpoint): 86 | self.network = self.nn_model.load_from_checkpoint(checkpoint) 87 | 88 | def save_model(self, checkpoint): 89 | self.trainer.save_checkpoint(checkpoint) 90 | 91 | 92 | # + 93 | ## Tabular Modules 94 | # - 95 | 96 | 97 | class MLP(LightningModule): 98 | def __init__(self, config): 99 | super().__init__() 100 | self.config = config 101 | 102 | neuron_sizes = config.get("neurons", 256) 103 | num_layers = config.get("num_layers", 2) 104 | 105 | self.layers = nn.Sequential( 106 | nn.Linear(config["input_shape"], neuron_sizes), 107 | nn.ReLU(), 108 | nn.Dropout(config.get("dropout", 0.5)), 109 | ) 110 | 111 | for i in range( 112 | 1, 113 | num_layers, 114 | ): 115 | new_neuron_sizes = int(neuron_sizes * config.get("neuron_scale", 0.5)) + 1 116 | self.layers.append( 117 | nn.Linear(neuron_sizes, new_neuron_sizes), 118 | ) 119 | self.layers.append(nn.ReLU()) 120 | self.layers.append(nn.Dropout(config.get("dropout", 0.5))) 121 | neuron_sizes = new_neuron_sizes 122 | 123 | self.layers.append(nn.Linear(neuron_sizes, config["output_shape"])) 124 | 125 | ## Need to have this to ensure correct hyper-parameters are loaded 126 | ## https://github.com/Lightning-AI/lightning/issues/3981 127 | 
self.save_hyperparameters() 128 | 129 | def forward(self, x): 130 | return self.layers(x.float()) 131 | 132 | def training_step(self, batch, batch_idx): 133 | x, y = batch 134 | y_hat = self.layers(x.float()) 135 | loss = F.mse_loss(y_hat, y.float()) 136 | self.log("train_loss", loss) 137 | return loss 138 | 139 | def validation_step(self, batch, batch_idx): 140 | x, y = batch 141 | y_hat = self.layers(x.float()) 142 | loss = F.mse_loss(y_hat, y.float()) 143 | self.log("val_loss", loss) 144 | 145 | def predict_step(self, batch, batch_idx): 146 | return self(batch) 147 | 148 | def configure_optimizers(self): 149 | optimizer = torch.optim.Adam( 150 | self.parameters(), lr=self.config.get("learning_rate", 1e-4) 151 | ) 152 | return optimizer 153 | 154 | 155 | class LSTM_Tabular(LightningModule): 156 | def __init__(self, config): 157 | super().__init__() 158 | self.config = config 159 | self.lstm = nn.LSTM( 160 | input_size=self.config.get("no_channels", 4), 161 | hidden_size=self.config.get("hidden_size", 4), 162 | num_layers=self.config.get("num_layers", 2), 163 | dropout=self.config.get("dropout", 0.1), 164 | batch_first=True, 165 | ) 166 | self.fc = nn.Linear( 167 | self.config.get("hidden_size", 4), self.config.get("output_shape", 1) 168 | ) 169 | self.save_hyperparameters() 170 | 171 | def forward(self, x): 172 | batch_size, flattened = x.shape 173 | x = torch.reshape( 174 | x, 175 | ( 176 | batch_size, 177 | -1, 178 | self.config.get("no_channels", 4), 179 | ), 180 | ) 181 | flip_columns_order = self.config.get("flip_column_order", True) 182 | if flip_columns_order: 183 | x = torch.flip(x, [1]) 184 | ## LSTM Layers 185 | lstm_out, _ = self.lstm( 186 | x.float() 187 | ) # lstm_out = (batch_size, seq_len, hidden_size) 188 | x = self.fc(lstm_out[:, -1]) 189 | return x 190 | 191 | def training_step(self, batch, batch_idx): 192 | x, y = batch 193 | y_hat = self.forward(x) 194 | loss = F.mse_loss(y_hat.float(), y.float()) 195 | self.log("train_loss", loss) 196 | return loss 197 | 198 | def validation_step(self, batch, batch_idx): 199 | x, y = batch 200 | y_hat = self.forward(x) 201 | loss = F.mse_loss(y_hat.float(), y.float()) 202 | self.log("val_loss", loss) 203 | 204 | def predict_step(self, batch, batch_idx): 205 | return self(batch) 206 | 207 | def configure_optimizers(self): 208 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 209 | return optimizer 210 | 211 | 212 | # + 213 | ## Time Series Models 214 | ## Needs Further Development 215 | # - 216 | 217 | 218 | class TimeSeriesDataset(Dataset): 219 | """Face Landmarks dataset.""" 220 | 221 | def __init__(self, timeseries, targets=None, lookback=200): 222 | """ 223 | Args: 224 | timeseries (pd.DataFrame): A DataFrame of a multivaraite time-series 225 | targets (pd.DataFrame): A DataFrame of targets for the time-series 226 | lookback (int): Number of data records to include in lookback 227 | """ 228 | self.X = timeseries 229 | self.y = targets 230 | self.lookback = lookback 231 | 232 | def __len__(self): 233 | return self.X.shape[0] - (self.lookback - 1) 234 | 235 | def __getitem__(self, idx): 236 | if torch.is_tensor(idx): 237 | idx = idx.tolist() 238 | if self.y is not None: 239 | return torch.tensor( 240 | self.X.values[idx : idx + self.lookback, :] 241 | ), torch.tensor(self.y.values[idx + self.lookback - 1, :]) 242 | else: 243 | return torch.tensor(self.X.values[idx : idx + self.lookback, :]) 244 | 245 | 246 | class TimeSeriesModel: 247 | def __init__(self, nn_model, config): 248 | """ 249 | Args: 250 | nn_model 
(LightningModule): Neural Networks implmented as a LightningModule 251 | config (dict): A dictionary which contains the parameters for training NN 252 | """ 253 | 254 | self.nn_model = nn_model 255 | self.config = config 256 | seed_everything(config.get("seed", 0), workers=True) 257 | 258 | def train(self, X_train, y_train, X_validate, y_validate): 259 | self.config["input_shape"] = X_train.shape[1] 260 | self.config["output_shape"] = y_train.shape[1] 261 | 262 | self.network = self.nn_model(self.config) 263 | 264 | early_stop_callback = EarlyStopping( 265 | monitor="val_loss", 266 | min_delta=0.00, 267 | patience=self.config.get("patience", 10), 268 | verbose=False, 269 | mode="min", 270 | ) 271 | 272 | ## Assume X is a DataFrame, assume y is a DataFrame or pd Series 273 | 274 | train_dataset = TimeSeriesDataset( 275 | X_train, y_train, lookback=self.config.get("lookback", 200) 276 | ) 277 | dataloader_train = torch.utils.data.DataLoader( 278 | train_dataset, batch_size=self.config.get("batch_size", 1000), shuffle=False 279 | ) 280 | validate_dataset = TimeSeriesDataset( 281 | X_validate, y_validate, lookback=self.config.get("lookback", 200) 282 | ) 283 | dataloader_validate = torch.utils.data.DataLoader( 284 | validate_dataset, 285 | batch_size=self.config.get("batch_size", 1000), 286 | shuffle=False, 287 | ) 288 | 289 | ## Use GPU if possible 290 | self.trainer = Trainer( 291 | accelerator="cuda", 292 | deterministic=True, 293 | auto_lr_find=True, 294 | max_epochs=self.config.get("max_epochs", 3), 295 | callbacks=[early_stop_callback], 296 | ) 297 | 298 | self.trainer.fit(self.network, dataloader_train, dataloader_validate) 299 | 300 | def predict(self, X): 301 | self.network.eval() 302 | with torch.no_grad(): 303 | predictions = self.network(X) 304 | return predictions.numpy() 305 | 306 | def load_model(self, checkpoint): 307 | self.network = self.nn_model.load_from_checkpoint(checkpoint) 308 | 309 | def save_model(self, checkpoint): 310 | self.trainer.save_checkpoint(checkpoint) 311 | 312 | 313 | # + 314 | ### TimeSeires Modules 315 | # - 316 | 317 | 318 | class LSTM(LightningModule): 319 | def __init__(self, config): 320 | super().__init__() 321 | self.config = config 322 | self.lstm = nn.LSTM( 323 | input_size=self.config.get("input_size", 11), 324 | hidden_size=self.config.get("hidden_size", 4), 325 | num_layers=self.config.get("num_layers", 2), 326 | dropout=self.config.get("dropout", 0.1), 327 | batch_first=True, 328 | ) 329 | self.fc = nn.Linear(self.config.get("hidden_size", 4), 11) 330 | 331 | def forward(self, x): 332 | lstm_out, _ = self.lstm(x) # lstm_out = (batch_size, seq_len, hidden_size) 333 | x = self.fc(lstm_out[:, -1]) 334 | return x 335 | 336 | def training_step(self, batch, batch_idx): 337 | x, y = batch 338 | y_hat = self.forward(x.float()) 339 | loss = F.mse_loss(y_hat, y.float()) 340 | self.log("train_loss", loss) 341 | return loss 342 | 343 | def validation_step(self, batch, batch_idx): 344 | x, y = batch 345 | y_hat = self.forward(x.float()) 346 | loss = F.mse_loss(y_hat, y.float()) 347 | self.log("val_loss", loss) 348 | 349 | def predict_step(self, batch, batch_idx): 350 | return self(batch) 351 | 352 | def configure_optimizers(self): 353 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 354 | return optimizer 355 | -------------------------------------------------------------------------------- /src/pythor/optimisation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: 
utf-8 -*- 3 | # 4 | # Optimising hyper-parameters for ML models with Optuna 5 | # 6 | 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import joblib, json, os, gc 11 | 12 | import optuna 13 | from optuna.samplers import RandomSampler, TPESampler 14 | 15 | 16 | from .util import GroupedTimeSeriesSplit, strategy_metrics 17 | from .benchmark import benchmark_pipeline, save_best_model, load_best_model 18 | from .numerai import load_numerai_data, score_numerai 19 | 20 | 21 | import logging 22 | 23 | logger = logging.getLogger("Numerai") 24 | 25 | 26 | ### Create Hyper-parameter space for optuna 27 | ### Extract parameter space that needs to be optimised from the config dictionary 28 | def create_optuna_space(config_dictionary, trial): 29 | space = dict() 30 | for step in ["feature_eng", "ml_method"]: 31 | for k, v in config_dictionary[step]["parameters"].items(): 32 | if isinstance(v, list): 33 | space[k] = getattr(trial, f"suggest_{v[0]}")(name=k, **v[1]) 34 | else: 35 | space[k] = v 36 | return space 37 | 38 | 39 | ### Create Parameter Sets from optuna trial instances 40 | def create_parameters_sets( 41 | args, 42 | config_dictionary, 43 | seed=0, 44 | ): 45 | ### Feature Engineering 46 | feature_eng_parameters = {} 47 | for k, v in config_dictionary["feature_eng"]["parameters"].items(): 48 | feature_eng_parameters[k] = args.get(k, v) 49 | 50 | ### ML Methods 51 | tabular_hyper = { 52 | "seed": seed, 53 | } 54 | 55 | for k, v in config_dictionary["ml_method"]["parameters"].items(): 56 | tabular_hyper[k] = args.get(k, v) 57 | 58 | ### Additional Hyper-parameters to be passed to training loop, NOT used now 59 | additional_hyper = dict() 60 | 61 | return feature_eng_parameters, tabular_hyper, additional_hyper 62 | 63 | 64 | # Create Objective function using optuna for Numerai Classic and Numerai Signals Tournament 65 | 66 | 67 | def create_optuna_numerai_objective( 68 | config_dictionary, numerai_files, seed=0, debug=False 69 | ): 70 | def objective(trial): 71 | with open(numerai_files["feature_metadata"], "r") as f: 72 | feature_metadata = json.load(f) 73 | if config_dictionary["model_params"]["feature_sets"] == "v4": 74 | features_optimizer = feature_metadata["feature_sets"]["fncv3_features"] 75 | bad_features = [ 76 | "feature_palpebral_univalve_pennoncel", 77 | "feature_unsustaining_chewier_adnoun", 78 | "feature_brainish_nonabsorbent_assurance", 79 | "feature_coastal_edible_whang", 80 | "feature_disprovable_topmost_burrower", 81 | "feature_trisomic_hagiographic_fragrance", 82 | "feature_queenliest_childing_ritual", 83 | "feature_censorial_leachier_rickshaw", 84 | "feature_daylong_ecumenic_lucina", 85 | "feature_steric_coxcombic_relinquishment", 86 | ] 87 | features_optimizer = list(set(features_optimizer) - set(bad_features)) 88 | else: 89 | features_optimizer = list() 90 | 91 | features, targets, groups, weights = load_numerai_data( 92 | numerai_files["dataset"], 93 | feature_metadata=numerai_files["feature_metadata"], 94 | resample=0, 95 | resample_freq=config_dictionary["model_params"]["train_resample_freq"], 96 | target_col=config_dictionary["model_params"]["train_targets"], 97 | data_version=config_dictionary["model_params"]["feature_sets"], 98 | startera=config_dictionary["model_params"]["train_startera"], 99 | endera=config_dictionary["model_params"]["train_endera"], 100 | ) 101 | 102 | param = create_optuna_space(config_dictionary, trial) 103 | 104 | logger.info(param) 105 | 106 | ( 107 | feature_eng_parameters, 108 | tabular_hyper, 109 | additional_hyper, 110 | ) = 
create_parameters_sets( 111 | param, 112 | config_dictionary, 113 | seed=seed, 114 | ) 115 | 116 | model_performance, trained_models, data, parameters = benchmark_pipeline( 117 | features, 118 | targets, 119 | weights, 120 | groups, 121 | feature_eng=config_dictionary["feature_eng"]["method"], 122 | feature_eng_parameters=feature_eng_parameters, 123 | tabular_model=config_dictionary["ml_method"]["method"], 124 | tabular_hyper=tabular_hyper, 125 | model_params=config_dictionary["model_params"]["train"], 126 | additional_hyper=additional_hyper, 127 | debug=debug, 128 | ) 129 | ## Get Predictions for each of the walk forward model 130 | ## Score on Validation data 131 | predictions = list() 132 | for model_name in list(data.keys()): 133 | predictions.append(data[model_name]["prediction"]) 134 | train_prediction_df = pd.DataFrame(pd.concat(predictions, axis=0).mean(axis=1)) 135 | train_prediction_df.columns = ["prediction"] 136 | train_prediction_df["target"] = targets.reindex(train_prediction_df.index) 137 | train_prediction_df["era"] = groups.reindex(train_prediction_df.index) 138 | train_prediction_df, correlations_by_era = score_numerai( 139 | train_prediction_df, 140 | features, 141 | riskiest_features=features_optimizer, 142 | proportion=float( 143 | config_dictionary["model_params"]["selection"]["proportion"] 144 | ), 145 | era_col="era", 146 | target_col_name="target", 147 | ) 148 | performances = strategy_metrics(correlations_by_era["neutralised_correlation"]) 149 | metric = performances[ 150 | config_dictionary["model_params"]["selection"]["criteria"] 151 | ] 152 | logger.info(f"Out of Sample Metric {metric}") 153 | return metric 154 | 155 | return objective 156 | 157 | 158 | def optuna_search( 159 | config_dictionary, 160 | numerai_files, 161 | n_trials=10, 162 | timeout=10000, 163 | seed=0, 164 | debug=False, 165 | ): 166 | optuna.logging.set_verbosity(optuna.logging.WARNING) 167 | 168 | numerai_objective = create_optuna_numerai_objective( 169 | config_dictionary, numerai_files, seed=seed, debug=debug 170 | ) 171 | study = optuna.create_study( 172 | direction="maximize", 173 | ) 174 | study.optimize( 175 | numerai_objective, n_trials=n_trials, timeout=timeout, gc_after_trial=True 176 | ) 177 | 178 | return study.best_trial.params, study.best_trial.value 179 | 180 | 181 | def train_best_model_optuna( 182 | target_col_name, 183 | end_era, 184 | best_parameters, 185 | config_dictionary, 186 | numerai_files, 187 | seed=0, 188 | debug=False, 189 | ): 190 | resample_seed = int( 191 | seed % config_dictionary["model_params"]["validate_resample_freq"] 192 | ) 193 | features, targets, groups, weights = load_numerai_data( 194 | numerai_files["dataset"], 195 | feature_metadata=numerai_files["feature_metadata"], 196 | resample=resample_seed, 197 | resample_freq=config_dictionary["model_params"]["validate_resample_freq"], 198 | target_col=[target_col_name], 199 | data_version=config_dictionary["model_params"]["feature_sets"], 200 | startera=config_dictionary["model_params"]["train_startera"], 201 | endera=end_era, 202 | ) 203 | 204 | output_folder = config_dictionary["model_params"]["output_folder"] 205 | 206 | if not os.path.exists(f"{output_folder}/"): 207 | os.mkdir(f"{output_folder}/") 208 | 209 | feature_eng_parameters, tabular_hyper, additional_hyper = create_parameters_sets( 210 | best_parameters, 211 | config_dictionary, 212 | seed=seed, 213 | ) 214 | 215 | model_performance, trained_models, data, parameters = benchmark_pipeline( 216 | features, 217 | targets, 218 | weights, 219 | 
groups, 220 | feature_eng=config_dictionary["feature_eng"]["method"], 221 | feature_eng_parameters=feature_eng_parameters, 222 | tabular_model=config_dictionary["ml_method"]["method"], 223 | tabular_hyper=tabular_hyper, 224 | model_params=config_dictionary["model_params"]["validate"], 225 | additional_hyper=additional_hyper, 226 | debug=debug, 227 | ) 228 | 229 | ## Save each model 230 | for model_name in list(trained_models.keys()): 231 | ## Save Parameters and Feature Transformer 232 | output_parameters_path = f"{output_folder}/{model_name}_{seed}.parameters" 233 | output_parameters = dict() 234 | output_parameters["parameters"] = parameters[model_name] 235 | output_parameters["transformer"] = trained_models[model_name]["transformer"] 236 | joblib.dump(output_parameters, output_parameters_path) 237 | 238 | ## Save Model 239 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 240 | save_best_model( 241 | trained_models[model_name]["model"], 242 | parameters[model_name]["model"]["tabular_model"], 243 | output_model_path, 244 | ) 245 | 246 | return None 247 | 248 | 249 | def numerai_optimisation_pipeline_optuna( 250 | config_dictionary, 251 | numerai_files, 252 | run_optimisation=True, 253 | optimised_parameters_path="numerai_best_parameters.json", 254 | grid_search_seed=0, 255 | n_trials=40, 256 | timeout=2000, 257 | debug=False, 258 | ): 259 | ## Search for optimal hyper-parameters 260 | if run_optimisation: 261 | best_parameters, best_value = optuna_search( 262 | config_dictionary, 263 | numerai_files, 264 | seed=grid_search_seed, 265 | n_trials=n_trials, 266 | timeout=timeout, 267 | debug=debug, 268 | ) 269 | with open(optimised_parameters_path, "w") as f: 270 | best_parameters["Optuna_Best_Value"] = best_value 271 | json.dump(best_parameters, f) 272 | else: 273 | with open(optimised_parameters_path, "r") as f: 274 | best_parameters = json.load(f) 275 | logger.info(f"Using Best parameters {best_parameters}") 276 | 277 | START_SEED = config_dictionary["model_params"]["model_no_start"] 278 | NO_MODELS_PER_CONFIG = config_dictionary["model_params"]["no_models_per_config"] 279 | 280 | if config_dictionary["model_params"]["mix_cv"]: 281 | for target_col_name in config_dictionary["model_params"]["validate_targets"]: 282 | for end_era in config_dictionary["model_params"]["validate_enderas"]: 283 | for seed in range(START_SEED, START_SEED + NO_MODELS_PER_CONFIG): 284 | ## Check if Model already exists 285 | output_folder = config_dictionary["model_params"]["output_folder"] 286 | tabular_model = config_dictionary["ml_method"]["method"] 287 | feature_eng = config_dictionary["feature_eng"]["method"] 288 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, 1) 289 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 290 | if not os.path.exists(output_model_path): 291 | train_best_model_optuna( 292 | target_col_name, 293 | end_era, 294 | best_parameters, 295 | config_dictionary, 296 | numerai_files, 297 | seed=seed, 298 | debug=debug, 299 | ) 300 | 301 | START_SEED = START_SEED + NO_MODELS_PER_CONFIG 302 | else: 303 | for end_era in config_dictionary["model_params"]["validate_enderas"]: 304 | for target_col_name in config_dictionary["model_params"][ 305 | "validate_targets" 306 | ]: 307 | for seed in range(START_SEED, START_SEED + NO_MODELS_PER_CONFIG): 308 | ## Check if Model already exists 309 | output_folder = config_dictionary["model_params"]["output_folder"] 310 | tabular_model = config_dictionary["ml_method"]["method"] 311 | feature_eng = 
config_dictionary["feature_eng"]["method"] 312 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, 1) 313 | output_model_path = f"{output_folder}/{model_name}_{seed}.model" 314 | if not os.path.exists(output_model_path): 315 | train_best_model_optuna( 316 | target_col_name, 317 | end_era, 318 | best_parameters, 319 | config_dictionary, 320 | numerai_files, 321 | seed=seed, 322 | debug=debug, 323 | ) 324 | 325 | START_SEED = START_SEED + NO_MODELS_PER_CONFIG 326 | -------------------------------------------------------------------------------- /src/pythor/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of GBDT models for temporal tabular data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | import pandas as pd 21 | import numpy as np 22 | import joblib, os, shutil, datetime 23 | 24 | 25 | 26 | import logging, gc 27 | 28 | if torch.cuda.is_available(): 29 | import cupy as cp 30 | 31 | 32 | from sklearn.metrics import mean_squared_error 33 | from sklearn.model_selection import KFold, GroupKFold, GroupShuffleSplit 34 | 35 | ## Machine Learning packages 36 | from xgboost import XGBRegressor 37 | from lightgbm import LGBMRegressor, LGBMClassifier 38 | from catboost import CatBoostRegressor 39 | 40 | import lightgbm, xgboost, catboost 41 | import torch 42 | 43 | 44 | 45 | from .util import align_features_target, RollingTSTransformer, GroupedTimeSeriesSplit 46 | from .feature import benchmark_features_transform 47 | from .neural import TabularModel, MLP, LSTM_Tabular 48 | 49 | # ## Persistence of ML models 50 | 51 | 52 | ### Save Best Model using method provided 53 | def save_best_model(model, model_type, outputpath): 54 | if model_type in [ 55 | "lightgbm", 56 | "lightgbm-gbdt", 57 | "lightgbm-goss", 58 | "lightgbm-rf", 59 | "lightgbm-dart", 60 | ]: 61 | model.save_model(outputpath) 62 | if model_type in [ 63 | "xgboost", 64 | "xgboost-dart", 65 | "xgboost-gbtree", 66 | ]: 67 | model.save_model(outputpath) 68 | if model_type == "catboost": 69 | model.save_model(outputpath) 70 | if model_type in [ 71 | "Numerai-MLP", 72 | "Numerai-LSTM", 73 | ]: 74 | model.save_model(outputpath) 75 | if model_type == "tabnet": 76 | model.save_model(outputpath) 77 | os.rename("{}.zip".format(outputpath), outputpath) 78 | return None 79 | 80 | 81 | ### load Best Model using method provided 82 | def load_best_model(model_type, outputpath): 83 | if model_type in [ 84 | "lightgbm", 85 | "lightgbm-gbdt", 86 | "lightgbm-goss", 87 | "lightgbm-rf", 88 | "lightgbm-dart", 89 | ]: 90 | reg = lightgbm.Booster(model_file=outputpath) 91 | if model_type in [ 92 | "xgboost", 93 | "xgboost-dart", 94 | "xgboost-gbtree", 95 | ]: 96 | reg = xgboost.Booster() 97 | reg.load_model(outputpath) 98 | if model_type == "catboost": 99 | reg = catboost.CatBoost() 100 | reg.load_model(outputpath) 101 | if model_type in [ 102 | "Numerai-MLP", 103 | ]: 
104 | reg = TabularModel(MLP, config=dict()) 105 | reg.load_model(outputpath) 106 | if model_type in [ 107 | "Numerai-LSTM", 108 | ]: 109 | reg = TabularModel(LSTM_Tabular, config=dict()) 110 | reg.load_model(outputpath) 111 | if model_type == "tabnet": 112 | from pytorch_tabnet.tab_model import TabNetRegressor 113 | 114 | reg = TabNetRegressor() 115 | reg.load_model(outputpath) 116 | if model_type == "feature-momentum": 117 | reg = None 118 | return reg 119 | 120 | 121 | # ## Fit ML Models 122 | 123 | 124 | def benchmark_neural_model( 125 | extracted_features_train, 126 | y_train, 127 | weights_train, 128 | extracted_features_test=None, 129 | y_test=None, 130 | weights_test=None, 131 | tabular_model="Numerai-MLP", 132 | tabular_hyper=None, 133 | additional_hyper=None, 134 | debug=False, 135 | ): 136 | gc.collect() 137 | 138 | ## Initialise and Train Models 139 | if tabular_model in [ 140 | "Numerai-MLP", 141 | ]: 142 | reg = TabularModel(MLP, config=tabular_hyper) 143 | reg.train(extracted_features_train, y_train, extracted_features_test, y_test) 144 | pred = reg.predict(extracted_features_test.values) 145 | return reg, pred 146 | 147 | if tabular_model in [ 148 | "Numerai-LSTM", 149 | ]: 150 | reg = TabularModel(LSTM_Tabular, config=tabular_hyper) 151 | reg.train(extracted_features_train, y_train, extracted_features_test, y_test) 152 | pred = reg.predict(extracted_features_test.values) 153 | return reg, pred 154 | 155 | if tabular_model == "tabnet": 156 | ## Default is PyTorch Adam Optimizer 157 | from torch.optim import Adam 158 | from torch.optim.lr_scheduler import StepLR 159 | from pytorch_tabnet.tab_model import TabNetRegressor 160 | 161 | tabnet_hyper = dict() 162 | tabnet_hyper["optimizer_fn"] = Adam 163 | tabnet_hyper["optimizer_params"] = { 164 | "lr": 0.02, 165 | } 166 | tabnet_hyper["scheduler_fn"] = StepLR 167 | tabnet_hyper["scheduler_params"] = {"gamma": 0.95, "step_size": 20} 168 | 169 | for key in [ 170 | "seed", 171 | "n_d", 172 | "n_a", 173 | "n_steps", 174 | "n_independent", 175 | "n_shared", 176 | "gamma", 177 | "momentum", 178 | "lambda_sparse", 179 | ]: 180 | tabnet_hyper[key] = tabular_hyper[key] 181 | 182 | ## Separate Hyper-parameters in the fit function 183 | tabnet_fit_hyper = dict() 184 | for key in [ 185 | "max_epochs", 186 | "patience", 187 | "batch_size", 188 | ]: 189 | tabnet_fit_hyper[key] = tabular_hyper[key] 190 | 191 | reg = TabNetRegressor(**tabnet_hyper) 192 | reg.fit( 193 | extracted_features_train.values, 194 | y_train.values, 195 | eval_set=[(extracted_features_test.values, y_test.values)], 196 | max_epochs=tabnet_fit_hyper.get("max_epochs", 20), 197 | patience=tabnet_fit_hyper.get("patience", 5), 198 | batch_size=tabnet_fit_hyper.get("batch_size", 40960), 199 | virtual_batch_size=int(tabnet_fit_hyper.get("batch_size", 40960) / 4), 200 | num_workers=0, 201 | ) 202 | pred = reg.predict(extracted_features_test.values) 203 | return reg, pred 204 | 205 | 206 | def benchmark_tree_model( 207 | extracted_features_train, 208 | y_train, 209 | weights_train, 210 | extracted_features_test=None, 211 | y_test=None, 212 | weights_test=None, 213 | tabular_model="lightgbm", 214 | tabular_hyper=None, 215 | additional_hyper=None, 216 | debug=False, 217 | ): 218 | ### Free up Memory from previous loop 219 | gc.collect() 220 | 221 | #### Fit Regressor Model for different ML methods 222 | if tabular_model in [ 223 | "lightgbm", 224 | "lightgbm-gbdt", 225 | "lightgbm-dart", 226 | "lightgbm-goss", 227 | "lightgbm-rf", 228 | ]: 229 | if y_test is not None: 230 | 
train_data = lightgbm.Dataset( 231 | extracted_features_train, 232 | label=y_train, 233 | weight=weights_train, 234 | params={"max_bin": tabular_hyper["max_bin"]}, 235 | ) 236 | test_data = lightgbm.Dataset( 237 | extracted_features_test, 238 | label=y_test, 239 | weight=weights_test, 240 | params={"max_bin": tabular_hyper["max_bin"]}, 241 | ) 242 | early_stopping_rounds = tabular_hyper.get("early_stopping_round", 0) 243 | model = lightgbm.train( 244 | tabular_hyper, 245 | train_set=train_data, 246 | num_boost_round=tabular_hyper["num_iterations"], 247 | valid_sets=[test_data], 248 | callbacks=[ 249 | lightgbm.log_evaluation(period=1000), 250 | lightgbm.early_stopping(early_stopping_rounds), 251 | ], 252 | ) 253 | valid_iteration = min( 254 | additional_hyper.get("gbm_start_iteration", 0), 255 | int(model.num_trees() // 2), 256 | ) 257 | pred = model.predict( 258 | extracted_features_test, start_iteration=valid_iteration 259 | ) 260 | return model, pred 261 | else: 262 | train_data = lightgbm.Dataset( 263 | extracted_features_train, 264 | label=y_train, 265 | weight=weights_train, 266 | ) 267 | model = lightgbm.train( 268 | tabular_hyper, 269 | train_set=train_data, 270 | num_boost_round=tabular_hyper["num_iterations"], 271 | ) 272 | return model 273 | 274 | ## xgboost ignores extra parameters 275 | if tabular_model in [ 276 | "xgboost", 277 | "xgboost-dart", 278 | "xgboost-gbtree", 279 | ]: 280 | ## Create DMatrix 281 | if y_test is not None: 282 | train_data = xgboost.DMatrix( 283 | extracted_features_train, 284 | label=y_train.values.reshape(-1), 285 | weight=weights_train.values.reshape(-1), 286 | ) 287 | test_data = xgboost.DMatrix( 288 | extracted_features_test, 289 | label=y_test.values.reshape(-1), 290 | weight=weights_test.values.reshape(-1), 291 | ) 292 | ### Train XGBoost model 293 | model = xgboost.train( 294 | tabular_hyper, 295 | train_data, 296 | num_boost_round=tabular_hyper["num_boost_round"], 297 | evals=[(test_data, "xgboost_test_data")], 298 | early_stopping_rounds=tabular_hyper["early_stopping_rounds"], 299 | verbose_eval=100, 300 | ) 301 | start_iteration = min( 302 | additional_hyper.get("gbm_start_iteration", 0), 303 | int(model.best_iteration // 2), 304 | ) 305 | end_iteration = model.best_iteration 306 | pred = model.predict( 307 | test_data, 308 | iteration_range=(start_iteration, end_iteration), 309 | ) 310 | return model, pred 311 | else: 312 | train_data = xgboost.DMatrix( 313 | extracted_features_train, 314 | label=y_train.values.reshape(-1), 315 | weight=weights_train.values.reshape(-1), 316 | ) 317 | model = xgboost.train( 318 | tabular_hyper, 319 | train_data, 320 | num_boost_round=tabular_hyper["num_boost_round"], 321 | ) 322 | return model 323 | 324 | 325 | ### Run ML pipeline for temporal tabular data 326 | def benchmark_pipeline( 327 | features, 328 | target, 329 | weights, 330 | groups, 331 | model_params=None, 332 | feature_eng=None, 333 | feature_eng_parameters=None, 334 | tabular_model="lightgbm", 335 | tabular_hyper=None, 336 | additional_hyper=None, 337 | debug=False, 338 | ): 339 | if debug: 340 | print(f"Dataset Sizes {features.shape} {target.shape} {groups.shape}") 341 | 342 | if not model_params: 343 | model_params = { 344 | "valid_splits": 1, 345 | "test_size": 52, 346 | "max_train_size": 52, 347 | "gap": 52, 348 | "cross_validation": "GroupedTimeSeriesSplit", 349 | } 350 | 351 | ## Cross Validation split 352 | if model_params["cross_validation"] == "GroupedTimeSeriesSplit": 353 | tscv = GroupedTimeSeriesSplit( 354 | 
valid_splits=model_params["valid_splits"], 355 | test_size=model_params["test_size"], 356 | max_train_size=model_params["max_train_size"], 357 | gap=model_params["gap"], 358 | debug=debug, 359 | ) 360 | elif model_params["cross_validation"] == "GroupShuffleSplit": 361 | tscv = GroupShuffleSplit( 362 | n_splits=model_params["n_splits"], 363 | test_size=model_params["test_size"], 364 | train_size=model_params["train_size"], 365 | random_state=model_params.get("random_state", 0), 366 | ) 367 | else: 368 | tscv = KFold( 369 | n_splits=model_params["n_splits"], 370 | shuffle=True, 371 | random_state=model_params.get("random_state", 0), 372 | ) 373 | model_no = 1 374 | model_performance = dict() 375 | trained_models = dict() 376 | data = dict() 377 | parameters = dict() 378 | 379 | for train_index, test_index in tscv.split(features, groups=groups): 380 | ## Get Trained and Test Data 381 | if model_params["cross_validation"] == "GroupedTimeSeriesSplit": 382 | X_train, X_test = features.loc[train_index, :], features.loc[test_index, :] 383 | y_train, y_test = target.loc[train_index, :], target.loc[test_index, :] 384 | ## Data Weights are pd Series 385 | weights_train, weights_test = ( 386 | weights.loc[train_index], 387 | weights.loc[test_index], 388 | ) 389 | ## Group Labels are pd Series 390 | group_train, group_test = ( 391 | groups.loc[train_index], 392 | groups.loc[test_index], 393 | ) 394 | if debug: 395 | print(X_train.shape, X_test.shape) 396 | 397 | ## For Existing Cross Validation Splits in scikit-learn it is based on index location (iloc) 398 | else: 399 | X_train, X_test = ( 400 | features.iloc[train_index, :], 401 | features.iloc[test_index, :], 402 | ) 403 | y_train, y_test = target.iloc[train_index, :], target.iloc[test_index, :] 404 | weights_train, weights_test = ( 405 | weights.iloc[train_index], 406 | weights.iloc[test_index], 407 | ) 408 | group_train, group_test = ( 409 | groups.iloc[train_index], 410 | groups.iloc[test_index], 411 | ) 412 | 413 | ### Transform features 414 | 415 | ( 416 | transformer, 417 | extracted_features_train, 418 | extracted_features_test, 419 | ) = benchmark_features_transform( 420 | X_train, 421 | y_train, 422 | X_test, 423 | group_train, 424 | group_test, 425 | feature_eng, 426 | feature_eng_parameters, 427 | debug, 428 | ) 429 | 430 | if tabular_model in [ 431 | "lightgbm-gbdt", 432 | "lightgbm-goss", 433 | "lightgbm-dart", 434 | "lightgbm-rf", 435 | "xgboost-dart", 436 | "xgboost-gbtree", 437 | "catboost", 438 | "lightgbm", 439 | "xgboost", 440 | ]: 441 | ### Train Tabular Models 442 | reg, pred = benchmark_tree_model( 443 | extracted_features_train, 444 | y_train, 445 | weights_train, 446 | extracted_features_test, 447 | y_test, 448 | weights_test, 449 | tabular_model, 450 | tabular_hyper, 451 | additional_hyper, 452 | debug, 453 | ) 454 | 455 | if tabular_model in [ 456 | "Numerai-MLP", 457 | "Numerai-LSTM", 458 | "tabnet", 459 | ]: 460 | ### Train Tabular Models 461 | reg, pred = benchmark_neural_model( 462 | extracted_features_train, 463 | y_train, 464 | weights_train, 465 | extracted_features_test, 466 | y_test, 467 | weights_test, 468 | tabular_model, 469 | tabular_hyper, 470 | additional_hyper, 471 | debug, 472 | ) 473 | 474 | ## Convert Prediction output to a dataframe 475 | pred = pd.DataFrame(pred, index=y_test.index, columns=y_test.columns) 476 | 477 | model_name = "{}_{}_{}".format(tabular_model, feature_eng, model_no) 478 | 479 | parameters[model_name] = { 480 | "feature_eng": feature_eng_parameters.copy(), 481 | "tabular": 
tabular_hyper.copy(), 482 | "additional": additional_hyper.copy(), 483 | } 484 | 485 | ### Compute model performance 486 | model_metrics = dict() 487 | model_metrics["MSE"] = mean_squared_error(y_test, pred) 488 | model_performance[model_name] = model_metrics.copy() 489 | 490 | #### Training Parameters 491 | model_params["feature_columns"] = features.columns 492 | model_params["target_columns"] = target.columns 493 | model_params["feature_engineering"] = feature_eng 494 | model_params["tabular_model"] = tabular_model 495 | model_params["train_start"] = group_train.iloc[0] 496 | model_params["train_end"] = group_train.iloc[-1] 497 | model_params["validation_start"] = group_test.iloc[0] 498 | model_params["validation_end"] = group_test.iloc[-1] 499 | model_params["model_name"] = model_name 500 | parameters[model_name]["model"] = model_params.copy() 501 | 502 | if debug: 503 | print(parameters[model_name]) 504 | 505 | if transformer is not None: 506 | trained_models[model_name] = { 507 | "transformer": transformer.data, 508 | "model": reg, 509 | } 510 | else: 511 | trained_models[model_name] = { 512 | "transformer": None, 513 | "model": reg, 514 | } 515 | 516 | data[model_name] = { 517 | "prediction": pred, 518 | "y_test": y_test, 519 | } 520 | 521 | model_no += 1 522 | 523 | return model_performance, trained_models, data, parameters 524 | -------------------------------------------------------------------------------- /src/pythor/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of tools for data pre-processing for non-stationary time-series and tabular data 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | 21 | import pandas as pd 22 | import numpy as np 23 | import joblib, os, glob 24 | 25 | from sklearn.model_selection import TimeSeriesSplit 26 | from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples 27 | 28 | from sklearn.inspection import permutation_importance 29 | from sklearn.preprocessing import StandardScaler 30 | from sklearn.base import TransformerMixin, BaseEstimator 31 | 32 | 33 | """ 34 | Strategy Metrics , Regime Analysis 35 | """ 36 | 37 | 38 | def strategy_metrics(strategy, interval=1, numerai=True, accuracy=4): 39 | results = dict() 40 | results["mean"] = np.around(strategy.mean(), accuracy) 41 | results["volatility"] = np.around(strategy.std(), accuracy) 42 | results["skew"] = np.around(strategy.skew(), accuracy) 43 | results["kurtosis"] = np.around(strategy.kurtosis(), accuracy) 44 | if numerai: 45 | portfolio = strategy.cumsum() 46 | else: 47 | portfolio = (1 + strategy).cumprod() 48 | if numerai: 49 | dd = portfolio - portfolio.cummax() 50 | else: 51 | dd = (portfolio - portfolio.cummax()) / portfolio.cummax() 52 | results["max_drawdown"] = np.around(-1 * dd.cummin().min(), accuracy).item() 53 | if strategy.std() > 0: 54 | results["sharpe"] = np.around(strategy.mean() / strategy.std(), accuracy) 55 | else: 56 | results["sharpe"] = np.around(results["mean"] / 1e-4, accuracy) 57 | if results["max_drawdown"] > 0: 58 | results["calmar"] = np.around( 59 | results["mean"] / results["max_drawdown"], accuracy 60 | ) 61 | else: 62 | results["calmar"] = np.around(results["mean"] / 1e-4, accuracy) 63 | return results 64 | 65 | 66 | def regime_analysis( 67 | df, 68 | performance_col="correlation", 69 | regime_columns="regime", 70 | ): 71 | ans = df.groupby(regime_columns).agg({performance_col: strategy_metrics}) 72 | ans_df = pd.DataFrame(ans[performance_col].values.tolist()) 73 | ans_df.index = ans.index 74 | return ans_df.reset_index() 75 | 76 | 77 | """ 78 | Dynamic Model Selection 79 | """ 80 | 81 | 82 | def dynamic_model_selection_masks(performances, gap=6, lookback=52, top_models=1): 83 | mean = performances.shift(gap).rolling(lookback).mean() 84 | volatility = performances.shift(gap).rolling(lookback).std() 85 | skew = performances.shift(gap).rolling(lookback).skew() 86 | kurt = performances.shift(gap).rolling(lookback).kurt() 87 | drawdown = ( 88 | -1 89 | * ( 90 | performances.shift(gap).cumsum() - performances.shift(gap).cumsum().cummax() 91 | ).cummin() 92 | ) 93 | sharpe = mean / volatility 94 | calmar = mean / drawdown 95 | 96 | metric_masks = dict() 97 | for metric in [ 98 | "mean", 99 | "volatility", 100 | "skew", 101 | "kurt", 102 | "drawdown", 103 | "sharpe", 104 | "calmar", 105 | ]: 106 | metric_masks[f"{metric}_min"] = np.where( 107 | locals()[metric].rank( 108 | axis=1, 109 | ascending=True, 110 | na_option="bottom", 111 | ) 112 | <= top_models, 113 | 1 / top_models, 114 | np.nan, 115 | ) 116 | metric_masks[f"{metric}_max"] = np.where( 117 | locals()[metric].rank( 118 | axis=1, 119 | ascending=False, 120 | na_option="bottom", 121 | ) 122 | <= top_models, 123 | 1 / top_models, 124 | np.nan, 125 | ) 126 | 127 | masks_dataframes = dict() 128 | for metric in [ 129 | "mean", 130 | "volatility", 131 | "skew", 132 | "kurt", 133 | "drawdown", 134 | "sharpe", 135 | "calmar", 136 | ]: 137 | masks_dataframes[f"{metric}_min"] = pd.DataFrame( 138 | metric_masks[f"{metric}_min"], 139 | columns=locals()[metric].columns, 140 | index=locals()[metric].index, 141 | ) 142 | masks_dataframes[f"{metric}_max"] = pd.DataFrame( 143 | 
metric_masks[f"{metric}_max"], 144 | columns=locals()[metric].columns, 145 | index=locals()[metric].index, 146 | ) 147 | return masks_dataframes 148 | 149 | 150 | def walk_forward_dynamic_models(df_list): 151 | Model_Sets = dict() 152 | Imputed_Models = dict() 153 | 154 | for key in [ 155 | "Ensemble", 156 | "Baseline", 157 | "Optimizer", 158 | "Small", 159 | "Medium", 160 | "Standard", 161 | "Average", 162 | ]: 163 | Model_Sets[key] = list() 164 | 165 | for dynamic_models in df_list: 166 | Model_Sets["Ensemble"].append( 167 | dynamic_models[ 168 | [ 169 | x 170 | for x in dynamic_models.columns 171 | if "baseline" in x 172 | or "optimizer" in x 173 | or ("standard" in x and not "average" in x and not "random" in x) 174 | ] 175 | ] 176 | ) 177 | Model_Sets["Average"].append( 178 | dynamic_models[ 179 | [ 180 | x 181 | for x in dynamic_models.columns 182 | if "baseline" in x or "optimizer" in x or "average" in x 183 | ] 184 | ] 185 | ) 186 | Model_Sets["Baseline"].append( 187 | dynamic_models[[x for x in dynamic_models.columns if "baseline" in x]] 188 | ) 189 | Model_Sets["Optimizer"].append( 190 | dynamic_models[[x for x in dynamic_models.columns if "optimizer" in x]] 191 | ) 192 | Model_Sets["Standard"].append( 193 | dynamic_models[ 194 | [ 195 | x 196 | for x in dynamic_models.columns 197 | if "standard" in x and not "average" in x and not "random" in x 198 | ] 199 | ] 200 | ) 201 | Model_Sets["Small"].append( 202 | dynamic_models[ 203 | [ 204 | x 205 | for x in dynamic_models.columns 206 | if "small" in x and not "average" in x and not "random" in x 207 | ] 208 | ] 209 | ) 210 | 211 | for key in [ 212 | "Ensemble", 213 | "Baseline", 214 | "Optimizer", 215 | "Small", 216 | "Standard", 217 | "Average", 218 | ]: 219 | models_over_time = pd.concat(Model_Sets[key], axis=1) 220 | # models_over_time = ( 221 | # models_over_time.transpose() 222 | # .fillna(models_over_time.mean(axis=1)) 223 | # .transpose() 224 | # ) 225 | models_over_time = models_over_time.transpose().fillna(0).transpose() 226 | Imputed_Models[key] = models_over_time.sort_index() 227 | 228 | return Imputed_Models 229 | 230 | 231 | ### Compare Against All Trained Models 232 | 233 | 234 | def create_leaderboard( 235 | performances_folder, 236 | searchkey="*", 237 | lookback=52, 238 | no_tops=1, 239 | model_no_lower=0, 240 | model_no_upper=1e8, 241 | ): 242 | ## Load csv files 243 | performances_files = sorted(glob.glob(f"{performances_folder}/{searchkey}.csv")) 244 | models_list = list() 245 | for f in performances_files: 246 | model_no = int(f.split(".csv")[0].split("_")[-2]) 247 | model_seq = int(f.split(".csv")[0].split("_")[-1]) 248 | model_name = "_".join(f.split(".csv")[0].split("/")[-1].split("_")[:3]) 249 | if ( 250 | os.path.isfile(f) 251 | and model_no_lower <= model_no 252 | and model_no <= model_no_upper 253 | ): 254 | df = pd.read_csv(f, index_col=0).sort_index() 255 | df = df[~df.index.duplicated()] 256 | df.index = pd.to_datetime(df.index) 257 | models_list.append(df) 258 | 259 | dynamic_models_collection = walk_forward_dynamic_models(models_list) 260 | 261 | ### Compute Performances of Portfolios of dynamically selected models 262 | recent_results = list() 263 | dynamic_portfolios = dict() 264 | gap = 6 265 | criteria = [ 266 | "mean", 267 | # "calmar", 268 | # "sharpe", 269 | ] 270 | 271 | for Sets in [ 272 | "Baseline", 273 | "Optimizer", 274 | "Ensemble", 275 | "Small", 276 | "Standard", 277 | ]: 278 | df = dynamic_models_collection[Sets].sort_index() 279 | if df.shape[0] > 0: 280 | dynamic_masks = 
dynamic_model_selection_masks( 281 | df, top_models=no_tops, lookback=lookback, gap=gap 282 | ) 283 | for base_method in criteria: 284 | for method in [ 285 | f"{base_method}_max", 286 | ]: 287 | portfolio = (dynamic_masks[method] * df).sum(axis=1, min_count=1) 288 | dynamic_portfolios[ 289 | f"{Sets}_{method}_{no_tops}_lookback_{lookback}" 290 | ] = portfolio.tail(df.shape[0] - lookback - gap) 291 | performances = strategy_metrics( 292 | portfolio.tail(df.shape[0] - lookback - gap) 293 | ) 294 | performances["method"] = method 295 | performances["no_tops"] = no_tops 296 | performances["sets"] = Sets 297 | performances["lookback"] = lookback 298 | recent_results.append(performances) 299 | 300 | dynamic_performances = pd.DataFrame(recent_results).dropna() 301 | 302 | leaderboards = dict() 303 | ## Recent Leaderboards to be used in Model Submissions 304 | for model_subset in [ 305 | "Baseline", 306 | "Ensemble", 307 | "Optimizer", 308 | "Small", 309 | "Standard", 310 | ]: 311 | leaderboard = pd.DataFrame( 312 | dynamic_models_collection[model_subset] 313 | .sort_index() 314 | .iloc[-1 * lookback :] 315 | .apply(strategy_metrics) 316 | .to_dict() 317 | ).transpose() 318 | 319 | if len(dynamic_models_collection[model_subset].columns) > 0: 320 | leaderboard.index = dynamic_models_collection[model_subset].columns 321 | leaderboard["proportion"] = [ 322 | float(x[-1]) for x in leaderboard.index.str.split("-") 323 | ] 324 | leaderboard["flavour"] = [x[-2] for x in leaderboard.index.str.split("-")] 325 | leaderboard["model_seq"] = [ 326 | int("-".join(x[:-2]).split("_")[-1]) 327 | for x in leaderboard.index.str.split("-") 328 | ] 329 | leaderboard["model_seed"] = [ 330 | int("-".join(x[:-2]).split("_")[-2]) 331 | for x in leaderboard.index.str.split("-") 332 | ] 333 | leaderboard["model_cv"] = [ 334 | "-".join(x[:-2]).split("_")[-3] 335 | for x in leaderboard.index.str.split("-") 336 | ] 337 | leaderboard["model_feature_engineering"] = [ 338 | "-".join(x[:-2]).split("_")[-4] 339 | for x in leaderboard.index.str.split("-") 340 | ] 341 | leaderboard["model_tabular_method"] = [ 342 | "-".join(x[:-2]).split("_")[-5] 343 | for x in leaderboard.index.str.split("-") 344 | ] 345 | leaderboards[model_subset] = leaderboard 346 | 347 | ## Leaderboard Since beginning of data 348 | if dynamic_models_collection[model_subset].shape[0] < lookback + gap: 349 | start_of_data = 0 350 | else: 351 | start_of_data = lookback + gap 352 | leaderboard = pd.DataFrame( 353 | dynamic_models_collection[model_subset] 354 | .sort_index() 355 | .iloc[start_of_data:] 356 | .apply(strategy_metrics) 357 | .to_dict() 358 | ).transpose() 359 | leaderboard.index = dynamic_models_collection[model_subset].columns 360 | leaderboards[f"{model_subset}-All"] = leaderboard 361 | 362 | return ( 363 | dynamic_performances, 364 | dynamic_portfolios, 365 | dynamic_models_collection, 366 | leaderboards, 367 | ) 368 | 369 | 370 | """ 371 | Cross Validation Schemes 372 | 373 | TimeSeries Grouped CV 374 | 375 | """ 376 | 377 | 378 | class GroupedTimeSeriesSplit(TimeSeriesSplit): 379 | def __init__( 380 | self, 381 | n_splits=5, 382 | valid_splits=1, 383 | max_train_size=None, 384 | test_size=52 * 2, 385 | gap=52, 386 | debug=False, 387 | ): 388 | self.n_splits = n_splits 389 | self.valid_splits = valid_splits 390 | self.shuffle = False 391 | self.random_state = None 392 | self.max_train_size = max_train_size 393 | self.test_size = test_size 394 | self.gap = gap 395 | self.debug = debug 396 | 397 | def split(self, X, y=None, groups=None): 398 | 
"""Generate indices to split data into training and test set. 399 | Parameters 400 | ---------- 401 | X : pd.DataFrame of shape (n_samples, n_features) 402 | Training data, where `n_samples` is the number of samples 403 | and `n_features` is the number of features. 404 | y : array-like of shape (n_samples,) 405 | Always ignored, exists for compatibility. 406 | groups : pd.Series of shape (n_samples,) 407 | Group Labels of training data 408 | Yields 409 | ------ 410 | train : ndarray 411 | The training set indices for that split. 412 | test : ndarray 413 | The testing set indices for that split. 414 | """ 415 | X, y, groups = indexable(X, y, groups) 416 | n_samples = _num_samples(X) 417 | 418 | if groups is None: 419 | # n_samples = X.shape[0] 420 | n_splits = self.n_splits 421 | valid_splits = self.valid_splits 422 | n_folds = n_splits + 1 423 | gap = self.gap 424 | test_size = ( 425 | self.test_size if self.test_size is not None else n_samples // n_folds 426 | ) 427 | 428 | # Make sure we have enough samples for the given split parameters 429 | if n_folds > n_samples: 430 | raise ValueError( 431 | f"Cannot have number of folds={n_folds} greater" 432 | f" than the number of samples={n_samples}." 433 | ) 434 | if n_samples - gap - (test_size * n_splits) <= 0: 435 | raise ValueError( 436 | f"Too many splits={n_splits} for number of samples" 437 | f"={n_samples} with test_size={test_size} and gap={gap}." 438 | ) 439 | 440 | indices = X.index 441 | test_starts = range( 442 | n_samples - valid_splits * test_size, n_samples, test_size 443 | ) 444 | 445 | for test_start in test_starts: 446 | train_end = test_start - gap 447 | if self.max_train_size and self.max_train_size < train_end: 448 | yield ( 449 | indices[max(train_end - self.max_train_size, 0) : train_end], 450 | indices[test_start : test_start + test_size], 451 | ) 452 | else: 453 | yield ( 454 | indices[:train_end], 455 | indices[test_start : test_start + test_size], 456 | ) 457 | else: 458 | ## Get unique groups 459 | unique_groups = groups.unique() 460 | gap = self.gap 461 | ## Calculate test size if not provided 462 | if self.test_size: 463 | n_folds = (len(unique_groups) - gap) // self.test_size 464 | else: 465 | n_folds = self.n_splits + 1 466 | self.test_size = len(unique_groups) // n_folds 467 | test_splits = [ 468 | unique_groups[ 469 | len(unique_groups) 470 | - (i + 1) * self.test_size : len(unique_groups) 471 | - i * self.test_size 472 | ] 473 | for i in range(n_folds - 1) 474 | ] 475 | if self.max_train_size: 476 | train_splits = [ 477 | unique_groups[ 478 | max( 479 | len(unique_groups) 480 | - (i + 1) * self.test_size 481 | - gap 482 | - self.max_train_size, 483 | 0, 484 | ) : len(unique_groups) 485 | - (i + 1) * self.test_size 486 | - gap 487 | ] 488 | for i in range(n_folds - 1) 489 | ] 490 | else: 491 | train_splits = [ 492 | unique_groups[: len(unique_groups) - (i + 1) * self.test_size - gap] 493 | for i in range(n_folds - 1) 494 | ] 495 | for i in range(0, self.valid_splits): 496 | yield ( 497 | groups[groups.isin(train_splits[i])].index, 498 | groups[groups.isin(test_splits[i])].index, 499 | ) 500 | 501 | 502 | """ 503 | Data Dimension Transformer 504 | Currently Implemeted: Constant lookback size with zero-padding 505 | Convert from 2D DataFrame, 506 | given a lookback size into nested DataFrames for sktime transformers 507 | """ 508 | 509 | 510 | def forward_fill_zero(series, length): 511 | fill_length = length - series.shape[0] 512 | fill_series = pd.Series(np.zeros(fill_length)) 513 | return 
pd.concat([fill_series, series], axis=0).reset_index(drop=True) 514 | 515 | 516 | ### Create rolling windows of nested dataframe for sktime, forward fill zero if there are not enough data at the start 517 | def roll_2D_to_nested(X, lookback=20, normalise=True): 518 | ## Python index start at zero 519 | lookback = lookback - 1 520 | index = X.index 521 | columns = X.columns 522 | output = np.empty((len(index), len(columns)), dtype=object) 523 | for i in range(X.shape[0]): 524 | for j, c in enumerate(X.columns): 525 | start_index = max(0, i - lookback) 526 | recent_rawdata = pd.Series(X.loc[X.index[start_index : i + 1], c]) 527 | if normalise and i >= 1: 528 | normalised_rawdata = ( 529 | recent_rawdata - recent_rawdata.mean() 530 | ) / recent_rawdata.std() 531 | output[i, j] = forward_fill_zero(normalised_rawdata, lookback + 1) 532 | else: 533 | output[i, j] = forward_fill_zero(recent_rawdata, lookback + 1) 534 | return pd.DataFrame(output, index=index, columns=columns) 535 | 536 | 537 | class RollingTSTransformer(BaseEstimator, TransformerMixin): 538 | def __init__(self, lookback=20, normalise=True): 539 | self.lookback = lookback 540 | self.normalise = normalise 541 | 542 | def fit(self, X, y): 543 | return self 544 | 545 | def transform(self, X): 546 | output = roll_2D_to_nested(X, self.lookback, self.normalise) 547 | return output 548 | 549 | 550 | ### Data Pre-processing 551 | 552 | 553 | def align_features_target(features, target, large_value=1e6): 554 | ## Flatten multi-index column names for tsfresh 555 | if isinstance(features, pd.DataFrame): 556 | if features.columns.nlevels > 1: 557 | features.columns = [ 558 | "_".join(column).rstrip("_") 559 | for column in features.columns.to_flat_index() 560 | ] 561 | ## Remove rows with na and align features and target to same length 562 | ##features.replace(np.inf, large_value, inplace=True) 563 | ##features.replace(-np.inf, -1 * large_value, inplace=True) 564 | ##features = features.dropna() 565 | ##target = target.dropna() 566 | valid_index = features.index.intersection(target.index) 567 | features = features.reindex(valid_index) 568 | target = target.reindex(valid_index) 569 | return features, target 570 | -------------------------------------------------------------------------------- /src/pythor/numerai.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # A collection of tools for data processing for Numerai and other temporal tabular data competitions 5 | # 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | import joblib, datetime, json, os 21 | import pandas as pd 22 | import numpy as np 23 | import scipy 24 | import torch 25 | import xgboost 26 | 27 | from .benchmark import load_best_model 28 | from .util import strategy_metrics 29 | from .feature import NumeraiTransformer 30 | 31 | if torch.cuda.is_available(): 32 | import cupy as cp 33 | import cudf 34 | from cuml.neighbors import KNeighborsRegressor 35 | else: 36 | from sklearn.neighbors import KNeighborsRegressor 37 | 38 | 39 | """ 40 | Helper Functions to convert Numerai Era and Datetime 41 | """ 42 | 43 | 44 | ## Shifting Numerai Era 45 | def shift_era(era, gap=6): 46 | new_era_int = int(era) + gap 47 | new_era = str(new_era_int) 48 | while len(new_era) < 4: 49 | new_era = "0" + new_era 50 | return new_era 51 | 52 | 53 | ## Convert datetime into Numerai eras 54 | def convert_datetime_to_era(sample_date): 55 | baseline = datetime.datetime(year=2003, month=1, day=3) 56 | differences = datetime.datetime.strptime(sample_date, "%Y-%m-%d") - baseline 57 | new_era = str(differences.days // 7 + 1) 58 | while len(new_era) < 4: 59 | new_era = "0" + new_era 60 | return new_era 61 | 62 | 63 | def convert_era_to_datetime(era): 64 | baseline = datetime.datetime(year=2003, month=1, day=3) 65 | new_datetime = baseline + datetime.timedelta(days=7 * (int(era) - 1)) 66 | return new_datetime 67 | 68 | 69 | ### Map columns Numerai Era to datetime 70 | def create_era_index( 71 | df, 72 | baseline=datetime.datetime(year=2003, month=1, day=3), 73 | ): 74 | mapped_era = [ 75 | baseline + datetime.timedelta(days=7 * (int(x) - 1)) for x in df.index 76 | ] 77 | df.index = mapped_era 78 | return df 79 | 80 | 81 | """ 82 | Data Loader for Numerai Data 83 | 84 | """ 85 | 86 | 87 | def load_numerai_data_era( 88 | filename, 89 | feature_metadata="v4_features.json", 90 | resample=0, 91 | resample_freq=1, 92 | target_col=["target"], 93 | era_col="era", 94 | data_version="v4", 95 | startera=None, 96 | endera=None, 97 | ): 98 | ## Read Train Data 99 | df_raw = pd.read_parquet(filename) 100 | ## Select Range 101 | if startera is not None and endera is not None: 102 | df_raw = df_raw[(df_raw[era_col] <= endera) & (df_raw[era_col] >= startera)] 103 | elif endera is not None: 104 | df_raw = df_raw[(df_raw[era_col] <= endera)] 105 | ## Downsample Eras 106 | if resample_freq > 1: 107 | downsampled_eras = df_raw[era_col].unique()[resample::resample_freq] 108 | df = df_raw[df_raw[era_col].isin(downsampled_eras)] 109 | else: 110 | df = df_raw.copy() 111 | 112 | del df_raw 113 | 114 | ## Features Sets 115 | feature_col = [col for col in df.columns if col.startswith("feature_")] 116 | 117 | if data_version in [ 118 | "v4", 119 | "v4-all", 120 | ]: 121 | bad_features = [ 122 | "feature_palpebral_univalve_pennoncel", 123 | "feature_unsustaining_chewier_adnoun", 124 | "feature_brainish_nonabsorbent_assurance", 125 | "feature_coastal_edible_whang", 126 | "feature_disprovable_topmost_burrower", 127 | "feature_trisomic_hagiographic_fragrance", 128 | "feature_queenliest_childing_ritual", 129 | "feature_censorial_leachier_rickshaw", 130 | "feature_daylong_ecumenic_lucina", 131 | "feature_steric_coxcombic_relinquishment", 132 | ] 133 | feature_col = list(set(feature_col) - set(bad_features)) 134 | 135 | ## Features and Targets are DataFrame 136 | if data_version == "signals": 137 | features = df[feature_col].fillna(0) 138 | ## For Numerai Classic Tournament, v4 dataset 139 | else: 140 | features = df[feature_col].fillna(2) - 2 141 | 142 | target_median = 
df[target_col].median() 143 | targets = df[target_col].fillna(target_median) - target_median 144 | ## Group column has to be pd.Series for time-series cross validation 145 | groups = df[era_col] 146 | ## weights column has to be pd.Series for time-series cross validation 147 | df["weights"] = 1 148 | weights = df["weights"] 149 | return features.astype(np.int8), targets.astype(np.float32), groups, weights 150 | 151 | 152 | def load_numerai_data( 153 | data_folder, 154 | feature_metadata="v4_features.json", 155 | resample=0, 156 | resample_freq=1, 157 | target_col=["target"], 158 | era_col="era", 159 | data_version="v4", 160 | startera=None, 161 | endera=None, 162 | ): 163 | if data_version in [ 164 | "v4", 165 | "v4.1", 166 | "v5", 167 | "v6", 168 | ]: 169 | features_list = list() 170 | targets_list = list() 171 | groups_list = list() 172 | weights_list = list() 173 | 174 | if startera is None: 175 | startera = "0001" 176 | if endera is None: 177 | endera = "0001" 178 | 179 | for i in range(int(startera) + resample, int(endera) + 1, resample_freq): 180 | if i <= 9: 181 | test_start_str = "000" + str(i) 182 | elif i <= 99: 183 | test_start_str = "00" + str(i) 184 | elif i <= 999: 185 | test_start_str = "0" + str(i) 186 | else: 187 | test_start_str = str(i) 188 | 189 | data_file = f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 190 | 191 | features, targets, groups, weights = load_numerai_data_era( 192 | data_file, 193 | feature_metadata=feature_metadata, 194 | resample=0, 195 | resample_freq=1, 196 | target_col=target_col, 197 | era_col=era_col, 198 | data_version=data_version, 199 | startera=test_start_str, 200 | endera=test_start_str, 201 | ) 202 | 203 | features_list.append(features) 204 | targets_list.append(targets) 205 | groups_list.append(groups) 206 | weights_list.append(weights) 207 | 208 | return ( 209 | pd.concat(features_list), 210 | pd.concat(targets_list), 211 | pd.concat(groups_list), 212 | pd.concat(weights_list), 213 | ) 214 | else: 215 | features, targets, groups, weights = load_numerai_data_era( 216 | data_folder, 217 | feature_metadata=feature_metadata, 218 | resample=resample, 219 | resample_freq=resample_freq, 220 | target_col=target_col, 221 | era_col=era_col, 222 | data_version=data_version, 223 | startera=startera, 224 | endera=endera, 225 | ) 226 | return features, targets, groups, weights 227 | 228 | 229 | """ 230 | Generate Predictions for Numerai 231 | 232 | trained_model: model object which has method .predict to generate predictions 233 | parameters: dictionary which contains parameters of the trained_model 234 | modelname: str Name of Model 235 | start_iteration: for tree-based methods, skip the first N trees in model when generating predictions 236 | startera: first era to get predictions 237 | endera: last era to get predictions 238 | 239 | 240 | Output: prediction_df: pd.DataFrame with columns era, prediction, model_name, target_col 241 | 242 | """ 243 | 244 | 245 | class FeatureMomentumModel: 246 | def __init__( 247 | self, 248 | lookback=52, 249 | shift=6, 250 | correlation_file_path=None, 251 | portfolio_file_path=None, 252 | target_col=None, 253 | seed=0, 254 | ): 255 | self.seed = seed 256 | self.lookback = lookback 257 | self.shift = shift 258 | self.correlation_file_path = correlation_file_path 259 | self.portfolio_file_path = portfolio_file_path 260 | self.target_col = target_col 261 | 262 | def predict(self, features): 263 | correlation_matrix = pd.read_parquet(self.correlation_file_path) 264 | factor_momentum = ( 265 | 
correlation_matrix.shift(self.shift) 266 | .fillna(0) 267 | .rolling(self.lookback) 268 | .mean() 269 | .dropna() 270 | ) 271 | last_momentum = factor_momentum.tail(1).transpose().squeeze()[features.columns] 272 | preds = features * np.sign(last_momentum) 273 | return preds.mean(axis=1) 274 | 275 | def copy_performance(self, outputfolder): 276 | portfolio = pd.read_csv(self.portfolio_file_path, index_col=0) 277 | portfolio.columns = [f"feature-momentum_None_1_{self.seed}_1-baseline-0"] 278 | portfolio.to_csv(f"{outputfolder}/feature-momentum_None_1_{self.seed}_1.csv") 279 | 280 | 281 | def predict_numerai( 282 | features_raw, 283 | targets, 284 | groups, 285 | trained_model, 286 | parameters, 287 | modelname="sample", 288 | gbm_start_iteration=0, ## Backward Comptability 289 | era_col="era", 290 | debug=False, 291 | ): 292 | ## Score on Dataset 293 | 294 | selected_cols = parameters["parameters"]["model"]["feature_columns"] 295 | target_col = parameters["parameters"]["model"]["target_columns"] 296 | 297 | ## Transform Features 298 | if parameters["parameters"]["model"]["feature_engineering"] is not None: 299 | if parameters["parameters"]["model"]["feature_engineering"] in [ 300 | "numerai", 301 | ]: 302 | feature_eng_parameters = parameters["parameters"]["feature_eng"] 303 | transformer = NumeraiTransformer(**feature_eng_parameters) 304 | transformer.data = parameters["transformer"] 305 | features = transformer.transform( 306 | features_raw[selected_cols], is_train=False 307 | ) 308 | if parameters["parameters"]["model"]["feature_engineering"] in [ 309 | "numeraiv4", 310 | "numeraiv4.1", 311 | ]: 312 | feature_eng_parameters = parameters["parameters"]["feature_eng"] 313 | transformer = NumeraiTransformerV4(**feature_eng_parameters) 314 | transformer.data = parameters["transformer"] 315 | features = transformer.transform( 316 | features_raw[selected_cols], is_train=False 317 | ) 318 | else: 319 | features = features_raw[selected_cols] 320 | 321 | ## Run Predictions 322 | ## For tree-based models can run some of the trees only 323 | if parameters["parameters"]["model"]["tabular_model"] in [ 324 | "lightgbm", 325 | "lightgbm-gbdt", 326 | "lightgbm-dart", 327 | "lightgbm-goss", 328 | ]: 329 | ## Backward Compatability 330 | if "additional" in parameters["parameters"] and gbm_start_iteration is None: 331 | gbm_start_iteration = parameters["parameters"]["additional"].get( 332 | "gbm_start_iteration", 0 333 | ) 334 | start_iteration = min( 335 | gbm_start_iteration, int(trained_model.num_trees() * 0.75) 336 | ) 337 | predictions_raw = trained_model.predict( 338 | features, start_iteration=start_iteration 339 | ) 340 | elif parameters["parameters"]["model"]["tabular_model"] in [ 341 | "xgboost", 342 | ]: 343 | if hasattr(trained_model, "best_iteration"): 344 | end_iteration = trained_model.best_iteration 345 | else: 346 | end_iteration = trained_model.num_boosted_rounds() 347 | start_iteration = min(gbm_start_iteration, int(end_iteration * 0.75)) 348 | xgboost_features = xgboost.DMatrix(features) 349 | predictions_raw = trained_model.predict( 350 | xgboost_features, 351 | iteration_range=(start_iteration, end_iteration), 352 | ) 353 | elif parameters["parameters"]["model"]["tabular_model"] in [ 354 | "Numerai-MLP", 355 | "Numerai-LSTM", 356 | ]: 357 | predictions_raw = trained_model.predict(features.values) 358 | elif parameters["parameters"]["model"]["tabular_model"] in [ 359 | "tabnet", 360 | ]: 361 | predictions_raw = trained_model.predict(features.values) 362 | elif 
parameters["parameters"]["model"]["tabular_model"] in [ 363 | "feature-momentum", 364 | ]: 365 | trained_model = FeatureMomentumModel(**parameters["parameters"]["tabular"]) 366 | predictions_raw = trained_model.predict(features) 367 | else: 368 | ## General Model which implements a predict method 369 | predictions_raw = trained_model.predict(features) 370 | 371 | ## Process Predictions into DataFrame 372 | predictions = pd.DataFrame( 373 | predictions_raw, 374 | columns=target_col, 375 | index=targets.index, 376 | ) 377 | predictions[era_col] = groups 378 | ## Rank Predictions within each era 379 | normalised_predictions = list() 380 | for i, df in predictions.groupby(era_col): 381 | per_era = df[target_col].rank(pct=True, axis=0) 382 | normalised_predictions.append(per_era) 383 | processed_predictions = pd.concat(normalised_predictions, axis=0) 384 | predictions["prediction"] = processed_predictions[target_col].mean(axis=1) 385 | prediction_df = pd.concat([predictions[[era_col, "prediction"]], targets], axis=1) 386 | prediction_df["model_name"] = modelname 387 | return prediction_df 388 | 389 | 390 | def predict_numerai_multiple( 391 | Numerai_Model_Names, 392 | correlation_matrix=None, 393 | filename="data/v4_all_int8.parquet", 394 | data_version="v4", 395 | startera=None, 396 | endera=None, 397 | debug=False, 398 | era_col="era", 399 | target_col=["target"], 400 | embargo=26, 401 | gbm_start_iteration=0, 402 | ): 403 | features, targets, groups, weights = load_numerai_data( 404 | filename, 405 | target_col=target_col, 406 | era_col=era_col, 407 | data_version=data_version, 408 | startera=startera, 409 | endera=endera, 410 | ) 411 | 412 | INDEX_COL_NAMES = features.index.names 413 | 414 | prediction_df_list = list() 415 | score_df_list = list() 416 | 417 | for Numerai_Model_Name in Numerai_Model_Names: 418 | modelname = Numerai_Model_Name.replace(".parameters", ".model") 419 | parameters = joblib.load(Numerai_Model_Name) 420 | most_recent_model = load_best_model( 421 | parameters["parameters"]["model"]["tabular_model"], modelname 422 | ) 423 | 424 | ## Check Embargo Period for Numerai Classic Models 425 | if data_version in [ 426 | "v4", 427 | "v4.1", 428 | "v5", 429 | "v6", 430 | ]: 431 | test_start = shift_era( 432 | parameters["parameters"]["model"]["validation_end"], embargo 433 | ) 434 | required_index = groups[groups >= test_start].index 435 | else: 436 | required_index = groups.index 437 | 438 | if debug: 439 | print(modelname, test_start) 440 | 441 | if required_index.shape[0] > 0: 442 | prediction_df = predict_numerai( 443 | features.loc[required_index], 444 | targets.loc[required_index], 445 | groups.loc[required_index], 446 | most_recent_model, 447 | parameters, 448 | modelname=modelname, 449 | gbm_start_iteration=gbm_start_iteration, 450 | era_col=era_col, 451 | debug=debug, 452 | ) 453 | prediction_df_list.append(prediction_df) 454 | 455 | if debug: 456 | print(prediction_df.columns, prediction_df.shape) 457 | 458 | if len(prediction_df_list) > 0: 459 | output_cols = [era_col, "prediction"] + target_col 460 | average_prediction_df = ( 461 | pd.concat(prediction_df_list, axis=0) 462 | .groupby(INDEX_COL_NAMES)[output_cols] 463 | .mean() 464 | ) 465 | average_prediction_df[era_col] = groups 466 | if debug: 467 | print(average_prediction_df.columns, average_prediction_df.shape) 468 | 469 | return average_prediction_df.sort_values(era_col), prediction_df_list 470 | else: 471 | return pd.DataFrame(), pd.DataFrame() 472 | 473 | 474 | """ 475 | Score Numerai Models with FN using 
CUDA 476 | 477 | prediction_df: pd.DataFrame with columns era, prediction, model_name, target_col and index id 478 | features: pd.DataFrame with columns feature_xxx and index id 479 | riskiest_fatures: list of str 480 | 481 | 482 | Output 483 | prediction_df: pd.DataFrame with columns era, model_name, prediction, neutralised_prediction, target_col, index id 484 | correlations_by_era: pd.DataFrame with columns correlation, normalised_correlation, neutralised_correlation, index era 485 | 486 | """ 487 | 488 | 489 | def score_numerai( 490 | prediction_df, 491 | features, 492 | riskiest_features, 493 | proportion=0, 494 | modelname="sample", 495 | target_col_name="target", 496 | prediction_col="prediction", 497 | era_col="era", 498 | debug=False, 499 | ): 500 | ## Find Correlation by era 501 | correlations_by_era = list() 502 | for i, df in prediction_df.groupby(era_col): 503 | output = dict() 504 | output[era_col] = i 505 | ## Computation on CUDA 506 | if torch.cuda.is_available(): 507 | temp = ( 508 | scipy.stats.rankdata(df[prediction_col], method="ordinal") - 0.5 509 | ) / len(df[prediction_col]) 510 | df["normalised_prediction"] = scipy.stats.norm.ppf(temp) 511 | ## Neutralised targets (FNC) 512 | if proportion > 0 and len(riskiest_features) > 0: 513 | exposures = cp.asarray(features.loc[df.index, riskiest_features]) 514 | normalised_prediction = cp.asarray(df["normalised_prediction"]) 515 | gram_mtx = cp.dot(cp.linalg.pinv(exposures), normalised_prediction) 516 | projected_values = normalised_prediction - cp.asarray( 517 | proportion 518 | ) * cp.dot(exposures, gram_mtx) 519 | df["neutralised_prediction"] = projected_values.get() 520 | df["neutralised_prediction"] = ( 521 | df["neutralised_prediction"] / df["neutralised_prediction"].std() 522 | ) 523 | output["neutralised_correlation"] = cp.corrcoef( 524 | cp.asarray(df[target_col_name]), 525 | cp.asarray(df["neutralised_prediction"].rank(pct=True)), 526 | )[0, 1].get() 527 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 528 | "neutralised_prediction" 529 | ].rank(pct=True) 530 | else: 531 | output["neutralised_correlation"] = cp.corrcoef( 532 | cp.asarray(df[target_col_name]), 533 | cp.asarray(df[prediction_col].rank(pct=True)), 534 | )[0, 1].get() 535 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 536 | prediction_col 537 | ].rank(pct=True) 538 | ### Computation on CPU 539 | else: 540 | ## Normalise prediction 541 | temp = ( 542 | scipy.stats.rankdata(df[prediction_col], method="ordinal") - 0.5 543 | ) / len(df[prediction_col]) 544 | df["normalised_prediction"] = scipy.stats.norm.ppf(temp) 545 | ## Neutralised targets (FNC) 546 | if proportion > 0 and len(riskiest_features) > 0: 547 | exposures = features.loc[df.index, riskiest_features] 548 | df["neutralised_prediction"] = df[ 549 | "normalised_prediction" 550 | ] - proportion * exposures.dot( 551 | np.linalg.pinv(exposures).dot(df["normalised_prediction"]) 552 | ) 553 | df["neutralised_prediction"] = ( 554 | df["neutralised_prediction"] / df["neutralised_prediction"].std() 555 | ) 556 | output["neutralised_correlation"] = np.corrcoef( 557 | df[target_col_name], df["neutralised_prediction"].rank(pct=True) 558 | )[0, 1] 559 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 560 | "neutralised_prediction" 561 | ].rank(pct=True) 562 | else: 563 | output["neutralised_correlation"] = np.corrcoef( 564 | df[target_col_name], df[prediction_col].rank(pct=True) 565 | )[0, 1] 566 | prediction_df.loc[df.index, "neutralised_prediction"] = df[ 567 | 
prediction_col 568 | ].rank(pct=True) 569 | correlations_by_era.append(output) 570 | ## Generate Overall files 571 | correlations_by_era_all = pd.DataFrame.from_records(correlations_by_era) 572 | prediction_df["model_name"] = modelname 573 | correlations_by_era_all["model_name"] = modelname 574 | return prediction_df, correlations_by_era_all 575 | 576 | 577 | """ 578 | Linear Factor Model 579 | Factor Timing 580 | rawdata: pd.DataFrame: Numerai dataset with columns containing the 1149 features and 20 targets, index id 581 | """ 582 | 583 | 584 | def numerai_feature_correlation_matrix( 585 | rawdata, feature_col=None, target_col_name=None, era_col="era" 586 | ): 587 | output = dict() 588 | for i, df in rawdata.groupby(era_col): 589 | corr_dict = dict() 590 | for feature in feature_col: 591 | corr_dict[feature] = np.corrcoef( 592 | df[feature].fillna(2).astype(float), df[target_col_name] 593 | )[0, 1] 594 | output[i] = corr_dict 595 | 596 | return pd.DataFrame.from_records(output).transpose()[feature_col] 597 | 598 | 599 | def numerai_feature_momentum( 600 | data_folder="../data/era", 601 | output_folder="../data/feature_momentum", 602 | data_version="v4", 603 | startera="0001", 604 | endera="1037", 605 | era_col="era", 606 | lookback=52, 607 | update_correlation_mtx=True, 608 | feature_col=None, 609 | ): 610 | if update_correlation_mtx: 611 | ## Calculate Correlation Matrix 612 | for i in range(int(startera), int(endera) + 1): 613 | if i <= 9: 614 | test_start_str = "000" + str(i) 615 | elif i <= 99: 616 | test_start_str = "00" + str(i) 617 | elif i <= 999: 618 | test_start_str = "0" + str(i) 619 | else: 620 | test_start_str = str(i) 621 | data_file = f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 622 | rawdata = pd.read_parquet(data_file) 623 | if feature_col is None: 624 | feature_col = [x for x in rawdata.columns if x.startswith("feature_")] 625 | target_cols = [x for x in rawdata.columns if x.startswith("target_")] 626 | for target_col in target_cols: 627 | correlation_file = f"{output_folder}/{target_col}_corr.parquet" 628 | if os.path.exists(correlation_file): 629 | correlation_matrix_old = pd.read_parquet(correlation_file) 630 | if test_start_str > correlation_matrix_old.index[-1]: 631 | rawdata_copy = rawdata.dropna(subset=[target_col]).copy() 632 | feature_col = correlation_matrix_old.columns 633 | if rawdata_copy.shape[0] > 0: 634 | correlation_matrix = numerai_feature_correlation_matrix( 635 | rawdata_copy, feature_col, target_col 636 | ) 637 | pd.concat( 638 | [correlation_matrix_old, correlation_matrix] 639 | ).to_parquet(correlation_file) 640 | else: 641 | rawdata_copy = rawdata.dropna(subset=[target_col]).copy() 642 | if rawdata_copy.shape[0] > 0: 643 | correlation_matrix = numerai_feature_correlation_matrix( 644 | rawdata_copy, feature_col, target_col 645 | ) 646 | correlation_matrix.to_parquet(correlation_file) 647 | 648 | data_file = f"{data_folder}/{data_version}_{endera}_int8.parquet" 649 | rawdata = pd.read_parquet(data_file) 650 | # feature_col = [x for x in rawdata.columns if x.startswith("feature_")] 651 | target_cols = [x for x in rawdata.columns if x.startswith("target_")] 652 | 653 | ## Factor Momentum Portfolio 654 | for target_col in target_cols: 655 | correlation_file = f"{output_folder}/{target_col}_corr.parquet" 656 | correlation_matrix = pd.read_parquet(correlation_file) 657 | feature_col = correlation_matrix.columns 658 | 659 | if "60" in target_col: 660 | gap = 14 661 | else: 662 | gap = 6 663 | 664 | factor_momentum = ( 665 | 
correlation_matrix.shift(gap).fillna(0).rolling(lookback).mean().dropna() 666 | ) 667 | factor_volatility = ( 668 | correlation_matrix.shift(gap).fillna(0).rolling(lookback).std().dropna() 669 | ) 670 | fm_max_index = factor_momentum.index.max() 671 | fm_min_index = factor_momentum.index.min() 672 | 673 | factor_momentum_eras = factor_momentum.unstack(level=0).reset_index() 674 | factor_momentum_eras.columns = ["feature_name", "era", "momentum"] 675 | factor_volatility_eras = factor_volatility.unstack(level=0).reset_index() 676 | factor_volatility_eras.columns = ["feature_name", "era", "volatility"] 677 | 678 | for i in range(int(startera), int(endera) + 1): 679 | if i <= 9: 680 | test_start_str = "000" + str(i) 681 | elif i <= 99: 682 | test_start_str = "00" + str(i) 683 | elif i <= 999: 684 | test_start_str = "0" + str(i) 685 | else: 686 | test_start_str = str(i) 687 | 688 | if (test_start_str <= fm_max_index) & (test_start_str >= fm_min_index): 689 | factor_file = f"{output_folder}/{target_col}_feature_momentum.csv" 690 | if os.path.exists(factor_file): 691 | factor_portfolio_old = pd.read_csv(factor_file, index_col=0) 692 | factor_portfolio_old.index = pd.to_datetime( 693 | factor_portfolio_old.index 694 | ) 695 | if ( 696 | convert_era_to_datetime(test_start_str) 697 | > factor_portfolio_old.index[-1] 698 | ): 699 | update = True 700 | else: 701 | update = False 702 | else: 703 | update = True 704 | 705 | if update: 706 | ## Read Data 707 | data_file = ( 708 | f"{data_folder}/{data_version}_{test_start_str}_int8.parquet" 709 | ) 710 | df = pd.read_parquet(data_file) 711 | feature_col = [x for x in df.columns if x.startswith("feature_")] 712 | df[feature_col] = df[feature_col].fillna(2) - 2 713 | 714 | ## Factor Momentum 715 | portfolio_predictions = df[[era_col, "target"]] 716 | per_era = df[feature_col] * np.sign( 717 | factor_momentum.loc[test_start_str, feature_col] 718 | ) 719 | portfolio_predictions["prediction"] = per_era.mean(axis=1) 720 | prediction_era, correlations_era = score_numerai( 721 | portfolio_predictions, 722 | df[feature_col], 723 | None, 724 | proportion=0, 725 | modelname=f"{target_col}_feature_momentum-baseline-0", 726 | target_col_name="target", 727 | ) 728 | factor_porfolio = create_era_index( 729 | correlations_era.pivot( 730 | index="era", 731 | columns=["model_name"], 732 | values=["neutralised_correlation"], 733 | ) 734 | ) 735 | if os.path.exists(factor_file): 736 | factor_portfolio_old = pd.read_csv(factor_file, index_col=0) 737 | factor_portfolio_old.index = pd.to_datetime( 738 | factor_portfolio_old.index 739 | ) 740 | pd.concat( 741 | [ 742 | factor_portfolio_old, 743 | factor_porfolio["neutralised_correlation"], 744 | ] 745 | ).to_csv(factor_file) 746 | else: 747 | factor_porfolio["neutralised_correlation"].to_csv(factor_file) 748 | 749 | 750 | """ 751 | 752 | Benchmark Performances of Numerai Models 753 | 754 | Run Model Performances for models trained with a single ML model 755 | 756 | """ 757 | 758 | 759 | def dynamic_feature_neutralisation( 760 | prediction_df, 761 | features_raw, 762 | feature_corr=None, 763 | features_optimizer=None, 764 | modelname="sample", 765 | era_col="era", 766 | target_col=["target"], 767 | cutoff=420, 768 | gap=6, 769 | lookback=52, 770 | proportion=1, 771 | debug=False, 772 | ): 773 | if features_optimizer is None: 774 | features_optimizer = features_raw.columns[:cutoff] 775 | 776 | if feature_corr is None: 777 | ## Get index by era 778 | prediction_dynamic = list() 779 | correlation_dynamic = list() 780 | for i, df in 
prediction_df.groupby(era_col): 781 | if debug: 782 | print(modelname, i, df.shape) 783 | prediction_df_era = prediction_df.loc[df.index] 784 | features_raw_era = features_raw.loc[df.index] 785 | ## Baseline 786 | prediction_df_era_new, correlations_by_era = score_numerai( 787 | prediction_df_era, 788 | features_raw_era, 789 | list(), 790 | proportion=0, 791 | modelname=f"{modelname}-baseline", 792 | target_col_name=target_col[0], 793 | era_col=era_col, 794 | debug=debug, 795 | ) 796 | prediction_dynamic.append(prediction_df_era_new.copy()) 797 | correlation_dynamic.append(correlations_by_era) 798 | return pd.concat(prediction_dynamic, axis=0), pd.concat( 799 | correlation_dynamic, axis=0 800 | ) 801 | 802 | else: 803 | ## Generate Feature Momentum Leaderboard 804 | factor_mean = ( 805 | feature_corr.shift(gap).fillna(0).rolling(lookback).mean().dropna() 806 | ) 807 | factor_volatility = ( 808 | feature_corr.shift(gap).fillna(0).rolling(lookback).std().dropna() 809 | ) 810 | factor_skew = ( 811 | feature_corr.shift(gap).fillna(0).rolling(lookback).skew().dropna() 812 | ) 813 | factor_kurt = ( 814 | feature_corr.shift(gap).fillna(0).rolling(lookback).kurt().dropna() 815 | ) 816 | factor_drawdown = ( 817 | (-1 * (feature_corr.cumsum() - feature_corr.cumsum().cummax()).cummin()) 818 | .shift(gap) 819 | .fillna(0) 820 | ) 821 | factor_sharpe = factor_mean / factor_volatility 822 | factor_calmar = factor_mean / factor_drawdown 823 | factor_autocorrelation = ( 824 | feature_corr.rolling(lookback) 825 | .corr(feature_corr.shift(4)) 826 | .shift(gap) 827 | .fillna(0) 828 | ) 829 | 830 | fm_max_index = factor_mean.index.max() 831 | fm_min_index = factor_mean.index.min() 832 | 833 | ## 834 | factor_flavour_eras = dict() 835 | for flavour in [ 836 | "mean", 837 | "volatility", 838 | ]: 839 | factor_flavour_eras[flavour] = ( 840 | locals()[f"factor_{flavour}"].unstack(level=0).reset_index() 841 | ) 842 | factor_flavour_eras[flavour].columns = ["feature_name", "era", flavour] 843 | 844 | ## Get index by era 845 | prediction_dynamic = list() 846 | correlation_dynamic = list() 847 | for i, df in prediction_df.groupby(era_col): 848 | if debug: 849 | print(modelname, i, df.shape) 850 | if (i <= fm_max_index) & (i >= fm_min_index): 851 | prediction_df_era = prediction_df.loc[df.index] 852 | features_raw_era = features_raw.loc[df.index] 853 | 854 | ## Baseline 855 | prediction_df_era_new, correlations_by_era = score_numerai( 856 | prediction_df_era, 857 | features_raw_era, 858 | list(), 859 | proportion=0, 860 | modelname=f"{modelname}-baseline", 861 | target_col_name=target_col[0], 862 | era_col=era_col, 863 | debug=debug, 864 | ) 865 | prediction_dynamic.append(prediction_df_era_new.copy()) 866 | correlation_dynamic.append(correlations_by_era) 867 | 868 | ## For v4-data only 869 | bad_features = [ 870 | "feature_palpebral_univalve_pennoncel", 871 | "feature_unsustaining_chewier_adnoun", 872 | "feature_brainish_nonabsorbent_assurance", 873 | "feature_coastal_edible_whang", 874 | "feature_disprovable_topmost_burrower", 875 | "feature_trisomic_hagiographic_fragrance", 876 | "feature_queenliest_childing_ritual", 877 | "feature_censorial_leachier_rickshaw", 878 | "feature_daylong_ecumenic_lucina", 879 | "feature_steric_coxcombic_relinquishment", 880 | ] 881 | 882 | features_optimizer = list(set(features_optimizer) - set(bad_features)) 883 | 884 | ## Optimizer 885 | prediction_df_era_new, correlations_by_era = score_numerai( 886 | prediction_df_era, 887 | features_raw_era, 888 | features_optimizer, 889 | 
proportion=proportion, 890 | modelname=f"{modelname}-optimizer", 891 | target_col_name=target_col[0], 892 | era_col=era_col, 893 | debug=debug, 894 | ) 895 | prediction_dynamic.append(prediction_df_era_new.copy()) 896 | correlation_dynamic.append(correlations_by_era) 897 | 898 | ### Dynamic Feature Neutralisation by different criteria 899 | DFN_params = list() 900 | for flavour in [ 901 | "mean", 902 | "volatility", 903 | ]: 904 | for direction in [ 905 | "tail", 906 | "head", 907 | ]: 908 | for size in [ 909 | 420, 910 | ]: 911 | if direction == "tail": 912 | name = f"high_{flavour}_" 913 | else: 914 | name = f"low_{flavour}_" 915 | if size == 420: 916 | name = name + "standard" 917 | elif size == 105: 918 | name = name + "small" 919 | temp = (flavour, size, direction, name) 920 | DFN_params.append(temp) 921 | 922 | for DFN_param in DFN_params: 923 | flavour = DFN_param[0] 924 | factor_flavour_era = factor_flavour_eras[flavour][ 925 | factor_flavour_eras[flavour]["era"] == i 926 | ] 927 | selected_features = getattr( 928 | factor_flavour_era.sort_values(flavour), DFN_param[2] 929 | )(DFN_param[1])["feature_name"] 930 | 931 | selected_features = list(set(selected_features) - set(bad_features)) 932 | 933 | prediction_df_era_new, correlations_by_era = score_numerai( 934 | prediction_df_era, 935 | features_raw_era, 936 | selected_features, 937 | proportion=proportion, 938 | modelname=f"{modelname}-{DFN_param[3]}", 939 | target_col_name=target_col[0], 940 | era_col=era_col, 941 | debug=debug, 942 | ) 943 | prediction_dynamic.append(prediction_df_era_new.copy()) 944 | correlation_dynamic.append(correlations_by_era) 945 | 946 | return pd.concat(prediction_dynamic, axis=0), pd.concat( 947 | correlation_dynamic, axis=0 948 | ) 949 | 950 | 951 | def save_model_performance_test( 952 | Numerai_Model_Names, 953 | feature_corr, 954 | features_optimizer, 955 | startera=None, 956 | endera=None, 957 | data_file="data/v4_all_int8.parquet", 958 | data_version="v4", 959 | target_col=["target"], 960 | debug=False, 961 | gbm_start_iteration=0, 962 | ): 963 | ( 964 | average_prediction_df, 965 | prediction_df_list, 966 | ) = predict_numerai_multiple( 967 | Numerai_Model_Names, 968 | feature_corr, 969 | filename=data_file, 970 | data_version=data_version, 971 | startera=startera, 972 | endera=endera, 973 | debug=debug, 974 | target_col=target_col, 975 | gbm_start_iteration=gbm_start_iteration, 976 | ) 977 | 978 | del prediction_df_list 979 | 980 | MODEL_NAME = Numerai_Model_Names[0].split(".parameters")[0].split("/")[-1] 981 | MODEL_NAME = MODEL_NAME + f"_{len(Numerai_Model_Names)}" 982 | 983 | ( 984 | features, 985 | targets, 986 | groups, 987 | weights, 988 | ) = load_numerai_data( 989 | data_file, 990 | resample_freq=1, 991 | startera=startera, 992 | endera=endera, 993 | target_col=target_col, 994 | data_version=data_version, 995 | ) 996 | 997 | dynamic_predictions, dynamic_correlations = dynamic_feature_neutralisation( 998 | average_prediction_df, 999 | features, 1000 | feature_corr, 1001 | features_optimizer, 1002 | target_col=target_col, 1003 | modelname=MODEL_NAME, 1004 | debug=debug, 1005 | ) 1006 | summary_correlations = dynamic_correlations.pivot( 1007 | index="era", columns="model_name", values=["neutralised_correlation"] 1008 | ).dropna() 1009 | strategy_flavour = pd.DataFrame.from_records( 1010 | summary_correlations.apply(strategy_metrics, axis=0), 1011 | index=summary_correlations.columns, 1012 | ) 1013 | if data_version == "signals": 1014 | return ( 1015 | strategy_flavour, 1016 | 
summary_correlations, 1017 | dynamic_predictions, 1018 | ) 1019 | else: 1020 | return ( 1021 | strategy_flavour, 1022 | create_era_index(summary_correlations), 1023 | dynamic_predictions, 1024 | ) 1025 | 1026 | 1027 | ## Run Numerai Model Performances for both Classic and Signals tournament 1028 | def run_numerai_models_performances( 1029 | Numerai_Model_Names, 1030 | feature_corr, 1031 | features_optimizer, 1032 | PERFORMANCES_FOLDER, 1033 | data_file="data/v4_all_int8.parquet", 1034 | data_version="v4", 1035 | target_col=["target"], 1036 | gbm_start_iteration=0, 1037 | ): 1038 | ## Calculate Starting Era 1039 | parametername = Numerai_Model_Names[0] 1040 | no_models = len(Numerai_Model_Names) 1041 | stem = parametername.split("/")[-1].replace(".parameters", "") 1042 | correlations_filename = f"{PERFORMANCES_FOLDER}/{stem}_{no_models}.csv" 1043 | if os.path.exists(parametername): 1044 | parameters = joblib.load(parametername) 1045 | if data_version == "signals": 1046 | test_start = parameters["parameters"]["model"]["validation_end"] 1047 | test_end = datetime.datetime.strptime("2099-12-31", "%Y-%m-%d") 1048 | else: 1049 | test_start = shift_era( 1050 | parameters["parameters"]["model"]["validation_end"], gap=14 1051 | ) 1052 | test_end = feature_corr.index[-1] 1053 | if os.path.exists(correlations_filename): 1054 | most_recent_date = pd.read_csv(correlations_filename, index_col=0).index[-1] 1055 | if data_version == "signals": 1056 | test_start = datetime.datetime.strptime(most_recent_date, "%Y-%m-%d") 1057 | else: 1058 | test_start = shift_era(convert_datetime_to_era(most_recent_date), gap=1) 1059 | print(f"Model Performances {test_start} {test_end}") 1060 | ### Get Model Predictions for the latest eras 1061 | if test_end >= test_start: 1062 | ( 1063 | validate_performance, 1064 | validate_correlations, 1065 | validate_predictions, 1066 | ) = save_model_performance_test( 1067 | Numerai_Model_Names, 1068 | feature_corr, 1069 | features_optimizer, 1070 | startera=test_start, 1071 | endera=test_end, 1072 | data_file=data_file, 1073 | data_version=data_version, 1074 | target_col=target_col, 1075 | gbm_start_iteration=gbm_start_iteration, 1076 | ) 1077 | ## Update Model Performances 1078 | output = validate_correlations["neutralised_correlation"] 1079 | if os.path.exists(correlations_filename): 1080 | old_file = pd.read_csv(correlations_filename, index_col=0) 1081 | df = pd.concat([old_file, output.dropna()]) 1082 | df.index = pd.to_datetime(df.index) 1083 | df[~df.index.duplicated()].sort_index().to_csv(correlations_filename) 1084 | else: 1085 | output.dropna().to_csv(correlations_filename) 1086 | --------------------------------------------------------------------------------