├── .github └── workflows │ └── build-models.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── cached-pickles ├── example_model.pkl ├── feature_neutralization.pkl ├── hello_numerai.pkl └── target_ensemble.pkl ├── example_model.ipynb ├── feature_neutralization.ipynb ├── hello_numerai.ipynb ├── target_ensemble.ipynb └── utils.py /.github/workflows/build-models.yml: -------------------------------------------------------------------------------- 1 | name: Build Example Model Pickles 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | # paths: 7 | # - example_model.ipynb 8 | # - hello_numerai.ipynb 9 | # - feature_neutralization.ipynb 10 | # - target_ensemble.ipynb 11 | branches: 12 | - master 13 | 14 | concurrency: build-example-models 15 | 16 | jobs: 17 | 18 | build_and_test: 19 | name: "Build Example Model Pickles" 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.10" 26 | - name: Install jupyter 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install jupyter 30 | pip install -r https://raw.githubusercontent.com/numerai/numerai-predict/refs/heads/master/requirements.txt 31 | - name: build-example-model 32 | run: | 33 | jupyter nbconvert \ 34 | --execute example_model.ipynb \ 35 | --ExecutePreprocessor.timeout=-1 \ 36 | --to html 37 | - name: build-hello-numerai 38 | run: | 39 | jupyter nbconvert \ 40 | --execute hello_numerai.ipynb \ 41 | --ExecutePreprocessor.timeout=-1 \ 42 | --to html 43 | - name: build-feature-neutralization 44 | run: | 45 | jupyter nbconvert \ 46 | --execute feature_neutralization.ipynb \ 47 | --ExecutePreprocessor.timeout=-1 \ 48 | --to html 49 | - name: build-target-ensemble 50 | run: | 51 | jupyter nbconvert \ 52 | --execute target_ensemble.ipynb \ 53 | --ExecutePreprocessor.timeout=-1 \ 54 | --to html 55 | - name: delete-html 56 | run: | 57 | rm example_model.html 58 | rm hello_numerai.html 59 | rm feature_neutralization.html 60 | rm target_ensemble.html 61 | - name: move-pickles-to-cached-pickles-dir 62 | run: | 63 | mkdir -p cached-pickles/ 64 | mv -f example_model.pkl cached-pickles/ 65 | mv -f hello_numerai.pkl cached-pickles/ 66 | mv -f feature_neutralization.pkl cached-pickles/ 67 | mv -f target_ensemble.pkl cached-pickles/ 68 | - name: commit-to-master 69 | uses: EndBug/add-and-commit@v9 70 | with: 71 | add: "cached-pickles/*" 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Data 92 | *.csv 93 | *.parquet 94 | *.json 95 | *.model 96 | 97 | .idea 98 | example_model.xgb 99 | 100 | .DS_Store -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Numerai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Numerai Example Scripts 2 | 3 | A collection of scripts and notebooks to help you get started quickly. 4 | 5 | Need help? Find us on Discord: 6 | 7 | [![](https://dcbadge.vercel.app/api/server/numerai)](https://discord.gg/numerai) 8 | 9 | 10 | ## Notebooks 11 | 12 | Try running these notebooks on Google Colab's free tier! 13 | 14 | ### Hello Numerai 15 | 16 | Open In Colab 17 | 18 | 19 | Start here if you are new! Explore the dataset and build your first model. 20 | 21 | ### Feature Neutralization 22 | 23 | Open In Colab 24 | 25 | 26 | Learn how to measure feature risk and control it with feature neutralization. 27 | 28 | ### Target Ensemble 29 | 30 | Open In Colab 31 | 32 | 33 | Learn how to create an ensemble trained on different targets. 34 | 35 | ### Model Upload 36 | 37 | Open In Colab 38 | 39 | 40 | A barebones example of how to build and upload your model to Numerai. 
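### Run a cached pickle locally

Each notebook ends by pickling a `predict(live_features, live_benchmark_models)` function, and the latest builds are committed to `cached-pickles/` by the GitHub Actions workflow. The sketch below is a rough illustration of how such a pickle can be exercised on your own machine, not the official Numerai runner; the `v5.0/live.parquet` path and passing `None` for the benchmark frame (which the example model ignores) are assumptions made for this example.

```python
# Minimal local smoke test of a pickled predict function (illustrative sketch only).
import cloudpickle
import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()
napi.download_dataset("v5.0/live.parquet")  # assumed path for current live features
live_features = pd.read_parquet("v5.0/live.parquet")

# Load the cached predict function built from example_model.ipynb
with open("cached-pickles/example_model.pkl", "rb") as f:
    predict = cloudpickle.load(f)

# example_model's predict ignores live_benchmark_models, so None is passed here.
predictions = predict(live_features, None)
print(predictions.head())
```

The result is a single `prediction` column indexed like the live features, which is the shape Numerai expects from an uploaded model.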
-------------------------------------------------------------------------------- /cached-pickles/example_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/example_model.pkl -------------------------------------------------------------------------------- /cached-pickles/feature_neutralization.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/feature_neutralization.pkl -------------------------------------------------------------------------------- /cached-pickles/hello_numerai.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/hello_numerai.pkl -------------------------------------------------------------------------------- /cached-pickles/target_ensemble.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/target_ensemble.pkl -------------------------------------------------------------------------------- /example_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ZqK_u9k-hMqE" 7 | }, 8 | "source": [ 9 | "# Model Upload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "Ekw8Z93ljC3v", 20 | "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Python 3.10.12\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "!python --version" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "yoy_wT1rhMqF", 43 | "outputId": "4268fdb0-84d2-4502-97e4-e93a1440c8ee" 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 51 | "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", 52 | "lida 0.0.10 requires fastapi, which is not installed.\n", 53 | "lida 0.0.10 requires kaleido, which is not installed.\n", 54 | "lida 0.0.10 requires python-multipart, which is not installed.\n", 55 | "lida 0.0.10 requires uvicorn, which is not installed.\u001b[0m\u001b[31m\n", 56 | "\u001b[0m" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Install dependencies\n", 62 | "!pip install -q numerapi pandas lightgbm cloudpickle==2.2.1 pyarrow scikit-learn scipy==1.10.1" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/", 71 | "height": 17 72 | }, 73 | "id": "13hdRk9ghMqI", 74 | "outputId": "857a4882-83e5-4a76-9b1e-57d6d822cc67" 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ", 80 | "text/plain": [ 81 | "" 82 | ] 83 | }, 84 | "metadata": {}, 85 | "output_type": "display_data" 86 | }, 87 | { 88 | "data": { 89 | "application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)", 90 | "text/plain": [ 91 | "" 92 | ] 93 | }, 94 | "metadata": {}, 95 | "output_type": "display_data" 96 | } 97 | ], 98 | "source": [ 99 | "from numerapi import NumerAPI\n", 100 | "import pandas as pd\n", 101 | "import json\n", 102 | "napi = NumerAPI()\n", 103 | "\n", 104 | "# use one of the latest data versions\n", 105 | "DATA_VERSION = \"v5.0\"\n", 106 | "\n", 107 | "# Download data\n", 108 | "napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n", 109 | "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n", 110 | "\n", 111 | "# Load data\n", 112 | "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n", 113 | "features = feature_metadata[\"feature_sets\"][\"small\"]\n", 114 | "# use \"medium\" or \"all\" for better performance. 
Requires more RAM.\n", 115 | "# features = feature_metadata[\"feature_sets\"][\"medium\"]\n", 116 | "# features = feature_metadata[\"feature_sets\"][\"all\"]\n", 117 | "train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n", 118 | "\n", 119 | "# For better models, join train and validation data and train on all of it.\n", 120 | "# This would cause diagnostics to be misleading though.\n", 121 | "# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n", 122 | "# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n", 123 | "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n", 124 | "# train = pd.concat([train, validation])\n", 125 | "\n", 126 | "# Downsample for speed\n", 127 | "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])] # skip this step for better performance\n", 128 | "\n", 129 | "# Train model\n", 130 | "import lightgbm as lgb\n", 131 | "model = lgb.LGBMRegressor(\n", 132 | " n_estimators=2000,\n", 133 | " learning_rate=0.01,\n", 134 | " max_depth=5,\n", 135 | " num_leaves=2**5-1,\n", 136 | " colsample_bytree=0.1\n", 137 | ")\n", 138 | "# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n", 139 | "# model = lgb.LGBMRegressor(\n", 140 | "# n_estimators=30_000,\n", 141 | "# learning_rate=0.001,\n", 142 | "# max_depth=10,\n", 143 | "# num_leaves=2**10,\n", 144 | "# colsample_bytree=0.1,\n", 145 | "# min_data_in_leaf=10000,\n", 146 | "# )\n", 147 | "model.fit(\n", 148 | " train[features],\n", 149 | " train[\"target\"]\n", 150 | ")\n", 151 | "\n", 152 | "# Define predict function\n", 153 | "def predict(\n", 154 | " live_features: pd.DataFrame,\n", 155 | " live_benchmark_models: pd.DataFrame\n", 156 | " ) -> pd.DataFrame:\n", 157 | " live_predictions = model.predict(live_features[features])\n", 158 | " submission = pd.Series(live_predictions, index=live_features.index)\n", 159 | " return submission.to_frame(\"prediction\")\n", 160 | "\n", 161 | "# Pickle predict function\n", 162 | "import cloudpickle\n", 163 | "p = cloudpickle.dumps(predict)\n", 164 | "with open(\"example_model.pkl\", \"wb\") as f:\n", 165 | " f.write(p)\n", 166 | "\n", 167 | "# Download file if running in Google Colab\n", 168 | "try:\n", 169 | " from google.colab import files\n", 170 | " files.download('example_model.pkl')\n", 171 | "except:\n", 172 | " pass" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "colab": { 178 | "provenance": [] 179 | }, 180 | "kernelspec": { 181 | "display_name": "venv", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.10.12" 196 | }, 197 | "orig_nbformat": 4 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 0 201 | } 202 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | THIS MODULE IS DEPRECATED. Use numerai-tools: 3 | https://github.com/numerai/numerai-tools 4 | 5 | If there is a feature missing from numerai-tools, please 6 | open an issue with a link to the function in this file you'd 7 | like to see. 
8 | """ 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import scipy 13 | from tqdm import tqdm 14 | from pathlib import Path 15 | import json 16 | 17 | ERA_COL = "era" 18 | TARGET_COL = "target_cyrus_v4_20" 19 | DATA_TYPE_COL = "data_type" 20 | EXAMPLE_PREDS_COL = "example_preds" 21 | MODEL_FOLDER = "models" 22 | MODEL_CONFIGS_FOLDER = "model_configs" 23 | PREDICTION_FILES_FOLDER = "prediction_files" 24 | 25 | 26 | def save_prediction(df, name): 27 | """DEPRECATED""" 28 | try: 29 | Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True) 30 | except Exception as ex: 31 | pass 32 | df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True) 33 | 34 | 35 | def save_model(model, name): 36 | """DEPRECATED""" 37 | try: 38 | Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True) 39 | except Exception as ex: 40 | pass 41 | pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl") 42 | 43 | 44 | def load_model(name): 45 | """DEPRECATED""" 46 | path = Path(f"{MODEL_FOLDER}/{name}.pkl") 47 | if path.is_file(): 48 | model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl") 49 | else: 50 | model = False 51 | return model 52 | 53 | 54 | def save_model_config(model_config, model_name): 55 | """DEPRECATED""" 56 | try: 57 | Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True) 58 | except Exception as ex: 59 | pass 60 | with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", "w") as fp: 61 | json.dump(model_config, fp) 62 | 63 | 64 | def load_model_config(model_name): 65 | """DEPRECATED""" 66 | path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json" 67 | path = Path(path_str) 68 | if path.is_file(): 69 | with open(path_str, "r") as fp: 70 | model_config = json.load(fp) 71 | else: 72 | model_config = False 73 | return model_config 74 | 75 | 76 | def get_biggest_change_features(corrs, n): 77 | """DEPRECATED""" 78 | all_eras = corrs.index.sort_values() 79 | h1_eras = all_eras[: len(all_eras) // 2] 80 | h2_eras = all_eras[len(all_eras) // 2 :] 81 | 82 | h1_corr_means = corrs.loc[h1_eras, :].mean() 83 | h2_corr_means = corrs.loc[h2_eras, :].mean() 84 | 85 | corr_diffs = h2_corr_means - h1_corr_means 86 | worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist() 87 | return worst_n 88 | 89 | 90 | def get_time_series_cross_val_splits(data, cv=3, embargo=12): 91 | """DEPRECATED""" 92 | all_train_eras = data[ERA_COL].unique() 93 | len_split = len(all_train_eras) // cv 94 | test_splits = [ 95 | all_train_eras[i * len_split : (i + 1) * len_split] for i in range(cv) 96 | ] 97 | # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv 98 | remainder = len(all_train_eras) % cv 99 | if remainder != 0: 100 | test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:]) 101 | 102 | train_splits = [] 103 | for test_split in test_splits: 104 | test_split_max = int(np.max(test_split)) 105 | test_split_min = int(np.min(test_split)) 106 | # get all of the eras that aren't in the test split 107 | train_split_not_embargoed = [ 108 | e 109 | for e in all_train_eras 110 | if not (test_split_min <= int(e) <= test_split_max) 111 | ] 112 | # embargo the train split so we have no leakage. 113 | # one era is length 5, so we need to embargo by target_length/5 eras. 114 | # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras. 
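        # e.g. if a test split covers eras 41-80 and embargo == 12,
        # only eras <= 28 or >= 93 remain in its train split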
115 | train_split = [ 116 | e 117 | for e in train_split_not_embargoed 118 | if abs(int(e) - test_split_max) > embargo 119 | and abs(int(e) - test_split_min) > embargo 120 | ] 121 | train_splits.append(train_split) 122 | 123 | # convenient way to iterate over train and test splits 124 | train_test_zip = zip(train_splits, test_splits) 125 | return train_test_zip 126 | 127 | 128 | def neutralize( 129 | df, 130 | columns, 131 | neutralizers=None, 132 | proportion=1.0, 133 | normalize=True, 134 | era_col="era", 135 | verbose=False, 136 | ): 137 | """DEPRECATED""" 138 | if neutralizers is None: 139 | neutralizers = [] 140 | unique_eras = df[era_col].unique() 141 | computed = [] 142 | if verbose: 143 | iterator = tqdm(unique_eras) 144 | else: 145 | iterator = unique_eras 146 | for u in iterator: 147 | df_era = df[df[era_col] == u] 148 | scores = df_era[columns].values 149 | if normalize: 150 | scores2 = [] 151 | for x in scores.T: 152 | x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x) 153 | x = scipy.stats.norm.ppf(x) 154 | scores2.append(x) 155 | scores = np.array(scores2).T 156 | exposures = df_era[neutralizers].values 157 | 158 | scores -= proportion * exposures.dot( 159 | np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot( 160 | scores.astype(np.float32) 161 | ) 162 | ) 163 | 164 | scores /= scores.std(ddof=0) 165 | 166 | computed.append(scores) 167 | 168 | return pd.DataFrame(np.concatenate(computed), columns=columns, index=df.index) 169 | 170 | 171 | def neutralize_series(series, by, proportion=1.0): 172 | """DEPRECATED""" 173 | scores = series.values.reshape(-1, 1) 174 | exposures = by.values.reshape(-1, 1) 175 | 176 | # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures 177 | exposures = np.hstack( 178 | (exposures, np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)) 179 | ) 180 | 181 | correction = proportion * ( 182 | exposures.dot(np.linalg.lstsq(exposures, scores, rcond=None)[0]) 183 | ) 184 | corrected_scores = scores - correction 185 | neutralized = pd.Series(corrected_scores.ravel(), index=series.index) 186 | return neutralized 187 | 188 | 189 | def unif(df): 190 | """DEPRECATED""" 191 | x = (df.rank(method="first") - 0.5) / len(df) 192 | return pd.Series(x, index=df.index) 193 | 194 | 195 | def numerai_corr(preds, target): 196 | """DEPRECATED""" 197 | # rank (keeping ties) then gaussianize predictions to standardize prediction distributions 198 | ranked_preds = (preds.rank(method="average").values - 0.5) / preds.count() 199 | gauss_ranked_preds = scipy.stats.norm.ppf(ranked_preds) 200 | # center targets around 0 201 | centered_target = target - target.mean() 202 | # raise both preds and target to the power of 1.5 to accentuate the tails 203 | preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5 204 | target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5 205 | # finally return the Pearson correlation 206 | return np.corrcoef(preds_p15, target_p15)[0, 1] 207 | 208 | 209 | def get_feature_neutral_mean( 210 | df, prediction_col, target_col, features_for_neutralization=None 211 | ): 212 | """DEPRECATED""" 213 | if features_for_neutralization is None: 214 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")] 215 | df.loc[:, "neutral_sub"] = neutralize( 216 | df, [prediction_col], features_for_neutralization 217 | )[prediction_col] 218 | scores = ( 219 | df.groupby("era") 220 | .apply(lambda x: numerai_corr(x["neutral_sub"], 
x[target_col])) 221 | .mean() 222 | ) 223 | return np.mean(scores) 224 | 225 | 226 | def get_feature_neutral_mean_tb_era( 227 | df, prediction_col, target_col, tb, features_for_neutralization=None 228 | ): 229 | """DEPRECATED""" 230 | if features_for_neutralization is None: 231 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")] 232 | temp_df = df.reset_index( 233 | drop=True 234 | ).copy() # Reset index due to use of argsort later 235 | temp_df.loc[:, "neutral_sub"] = neutralize( 236 | temp_df, [prediction_col], features_for_neutralization 237 | )[prediction_col] 238 | temp_df_argsort = temp_df.loc[:, "neutral_sub"].argsort() 239 | temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb], temp_df_argsort.iloc[-tb:]]) 240 | temp_df_tb = temp_df.loc[temp_df_tb_idx] 241 | tb_fnc = numerai_corr(temp_df_tb["neutral_sub"], temp_df_tb[target_col]) 242 | return tb_fnc 243 | 244 | 245 | def fast_score_by_date(df, columns, target, tb=None, era_col="era"): 246 | """DEPRECATED""" 247 | unique_eras = df[era_col].unique() 248 | computed = [] 249 | for u in unique_eras: 250 | df_era = df[df[era_col] == u] 251 | era_pred = np.float64(df_era[columns].values.T) 252 | era_target = np.float64(df_era[target].values.T) 253 | 254 | if tb is None: 255 | ccs = numerai_corr(era_pred, era_target) 256 | else: 257 | tbidx = np.argsort(era_pred, axis=1) 258 | tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1) 259 | ccs = [ 260 | numerai_corr(pd.Series(era_target[tmpidx]), pd.Series(tmppred[tmpidx])) 261 | for tmpidx, tmppred in zip(tbidx, era_pred) 262 | ] 263 | ccs = np.array(ccs) 264 | 265 | computed.append(ccs) 266 | 267 | return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique()) 268 | 269 | 270 | def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None): 271 | """DEPRECATED""" 272 | if feature_cols is None: 273 | feature_cols = [c for c in df.columns if c.startswith("feature")] 274 | u = df.loc[:, feature_cols].corrwith(df[prediction_col]) 275 | e = df.loc[:, feature_cols].corrwith(df[example_col]) 276 | return 1 - (np.dot(u, e) / np.dot(e, e)) 277 | 278 | 279 | def validation_metrics( 280 | validation_data, 281 | pred_cols, 282 | example_col, 283 | fast_mode=False, 284 | target_col=TARGET_COL, 285 | features_for_neutralization=None, 286 | ): 287 | """DEPRECATED""" 288 | validation_stats = pd.DataFrame() 289 | feature_cols = [c for c in validation_data if c.startswith("feature_")] 290 | for pred_col in pred_cols: 291 | # Check the per-era correlations on the validation set (out of sample) 292 | validation_correlations = validation_data.groupby(ERA_COL).apply( 293 | lambda d: numerai_corr(d[pred_col], d[target_col]) 294 | ) 295 | 296 | mean = validation_correlations.mean() 297 | std = validation_correlations.std(ddof=0) 298 | sharpe = mean / std 299 | 300 | validation_stats.loc["mean", pred_col] = mean 301 | validation_stats.loc["std", pred_col] = std 302 | validation_stats.loc["sharpe", pred_col] = sharpe 303 | 304 | rolling_max = ( 305 | (validation_correlations + 1) 306 | .cumprod() 307 | .rolling(window=9000, min_periods=1) # arbitrarily large 308 | .max() 309 | ) 310 | daily_value = (validation_correlations + 1).cumprod() 311 | max_drawdown = -((rolling_max - daily_value) / rolling_max).max() 312 | validation_stats.loc["max_drawdown", pred_col] = max_drawdown 313 | 314 | payout_scores = validation_correlations.clip(-0.25, 0.25) 315 | payout_daily_value = (payout_scores + 1).cumprod() 316 | 317 | apy = ( 318 | 
((payout_daily_value.dropna().iloc[-1]) ** (1 / len(payout_scores))) 319 | ** 49 # 52 weeks of compounding minus 3 for stake compounding lag 320 | - 1 321 | ) * 100 322 | 323 | validation_stats.loc["apy", pred_col] = apy 324 | 325 | if not fast_mode: 326 | # Check the feature exposure of your validation predictions 327 | max_per_era = validation_data.groupby(ERA_COL).apply( 328 | lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max() 329 | ) 330 | max_feature_exposure = max_per_era.mean() 331 | validation_stats.loc["max_feature_exposure", pred_col] = ( 332 | max_feature_exposure 333 | ) 334 | 335 | # Check feature neutral mean 336 | feature_neutral_mean = get_feature_neutral_mean( 337 | validation_data, pred_col, target_col, features_for_neutralization 338 | ) 339 | validation_stats.loc["feature_neutral_mean", pred_col] = ( 340 | feature_neutral_mean 341 | ) 342 | 343 | # Check TB200 feature neutral mean 344 | tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply( 345 | lambda df: get_feature_neutral_mean_tb_era( 346 | df, pred_col, target_col, 200, features_for_neutralization 347 | ) 348 | ) 349 | validation_stats.loc["tb200_feature_neutral_mean", pred_col] = ( 350 | tb200_feature_neutral_mean_era.mean() 351 | ) 352 | 353 | # Check top and bottom 200 metrics (TB200) 354 | tb200_validation_correlations = fast_score_by_date( 355 | validation_data, [pred_col], target_col, tb=200, era_col=ERA_COL 356 | ) 357 | 358 | tb200_mean = tb200_validation_correlations.mean()[pred_col] 359 | tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col] 360 | tb200_sharpe = tb200_mean / tb200_std 361 | 362 | validation_stats.loc["tb200_mean", pred_col] = tb200_mean 363 | validation_stats.loc["tb200_std", pred_col] = tb200_std 364 | validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe 365 | 366 | # MMC over validation 367 | mmc_scores = [] 368 | corr_scores = [] 369 | for _, x in validation_data.groupby(ERA_COL): 370 | series = neutralize_series(unif(x[pred_col]), (x[example_col])) 371 | mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29**2)) 372 | corr_scores.append(unif(x[pred_col]).corr(x[target_col])) 373 | 374 | val_mmc_mean = np.mean(mmc_scores) 375 | val_mmc_std = np.std(mmc_scores) 376 | corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)] 377 | corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs) 378 | 379 | validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean 380 | validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe 381 | 382 | # Check correlation with example predictions 383 | per_era_corrs = validation_data.groupby(ERA_COL).apply( 384 | lambda d: unif(d[pred_col]).corr(unif(d[example_col])) 385 | ) 386 | corr_with_example_preds = per_era_corrs.mean() 387 | validation_stats.loc["corr_with_example_preds", pred_col] = ( 388 | corr_with_example_preds 389 | ) 390 | 391 | # Check exposure dissimilarity per era 392 | tdf = validation_data.groupby(ERA_COL).apply( 393 | lambda df: exposure_dissimilarity_per_era( 394 | df, pred_col, example_col, feature_cols 395 | ) 396 | ) 397 | validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean() 398 | 399 | # .transpose so that stats are columns and the model_name is the row 400 | return validation_stats.transpose() 401 | --------------------------------------------------------------------------------
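Although `utils.py` is deprecated in favor of numerai-tools, `validation_metrics` still shows what a full diagnostic pass computes. The sketch below is illustrative only: the `v5.0` paths, the `small` feature set, and the stand-in prediction columns (raw features used in place of a real model and of example predictions) are assumptions, not part of this repo.

```python
# Illustrative smoke test of the deprecated validation_metrics helper (prefer numerai-tools).
import json
import pandas as pd
from numerapi import NumerAPI
from utils import validation_metrics

napi = NumerAPI()
napi.download_dataset("v5.0/validation.parquet")
napi.download_dataset("v5.0/features.json")

features = json.load(open("v5.0/features.json"))["feature_sets"]["small"]
cols = ["era", "data_type", "target"] + features
validation = pd.read_parquet("v5.0/validation.parquet", columns=cols)
validation = validation[validation["data_type"] == "validation"].dropna(subset=["target"])

# Stand-in predictions so the call is self-contained: one feature plays the "model",
# another plays the example/benchmark predictions. Replace both with real predictions.
validation["prediction"] = validation[features[0]]
validation["example_preds"] = validation[features[1]]

stats = validation_metrics(
    validation,
    pred_cols=["prediction"],
    example_col="example_preds",
    fast_mode=True,  # skip the slower TB200 / MMC / neutralization metrics
    target_col="target",
)
print(stats)  # one row per prediction column: mean, std, sharpe, max_drawdown, apy
```

With `fast_mode=False` the same call also reports max feature exposure, feature-neutral means, TB200 statistics, and an MMC-style score, which is where the remaining helpers in this module are used.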