├── .github
│   └── workflows
│       └── build-models.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── cached-pickles
│   ├── example_model.pkl
│   ├── feature_neutralization.pkl
│   ├── hello_numerai.pkl
│   └── target_ensemble.pkl
├── example_model.ipynb
├── feature_neutralization.ipynb
├── hello_numerai.ipynb
├── target_ensemble.ipynb
└── utils.py
/.github/workflows/build-models.yml:
--------------------------------------------------------------------------------
1 | name: Build Example Model Pickles
2 |
3 | on:
4 | workflow_dispatch:
5 | push:
6 | # paths:
7 | # - example_model.ipynb
8 | # - hello_numerai.ipynb
9 | # - feature_neutralization.ipynb
10 | # - target_ensemble.ipynb
11 | branches:
12 | - master
13 |
14 | concurrency: build-example-models
15 |
16 | jobs:
17 |
18 | build_and_test:
19 | name: "Build Example Model Pickles"
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: actions/checkout@v3
23 | - uses: actions/setup-python@v5
24 | with:
25 | python-version: "3.10"
26 | - name: Install jupyter
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install jupyter
30 | pip install -r https://raw.githubusercontent.com/numerai/numerai-predict/refs/heads/master/requirements.txt
31 | - name: build-example-model
32 | run: |
33 | jupyter nbconvert \
34 | --execute example_model.ipynb \
35 | --ExecutePreprocessor.timeout=-1 \
36 | --to html
37 | - name: build-hello-numerai
38 | run: |
39 | jupyter nbconvert \
40 | --execute hello_numerai.ipynb \
41 | --ExecutePreprocessor.timeout=-1 \
42 | --to html
43 | - name: build-feature-neutralization
44 | run: |
45 | jupyter nbconvert \
46 | --execute feature_neutralization.ipynb \
47 | --ExecutePreprocessor.timeout=-1 \
48 | --to html
49 | - name: build-target-ensemble
50 | run: |
51 | jupyter nbconvert \
52 | --execute target_ensemble.ipynb \
53 | --ExecutePreprocessor.timeout=-1 \
54 | --to html
55 | - name: delete-html
56 | run: |
57 | rm example_model.html
58 | rm hello_numerai.html
59 | rm feature_neutralization.html
60 | rm target_ensemble.html
61 | - name: move-pickles-to-cached-pickles-dir
62 | run: |
63 | mkdir -p cached-pickles/
64 | mv -f example_model.pkl cached-pickles/
65 | mv -f hello_numerai.pkl cached-pickles/
66 | mv -f feature_neutralization.pkl cached-pickles/
67 | mv -f target_ensemble.pkl cached-pickles/
68 | - name: commit-to-master
69 | uses: EndBug/add-and-commit@v9
70 | with:
71 | add: "cached-pickles/*"
72 |
--------------------------------------------------------------------------------
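
The workflow above builds each pickle by executing its notebook with `jupyter nbconvert`; the `.pkl` file is written as a side effect of running the cells, and the HTML output is deleted afterwards. For reference, a minimal Python sketch of the same step for a single notebook, assuming `nbformat` and `nbconvert` are installed along with the numerai-predict requirements pulled in by the workflow:

```python
# Sketch of what the CI job does for one notebook, using the nbformat/nbconvert
# Python APIs instead of the nbconvert CLI.
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

NOTEBOOK = "example_model.ipynb"  # any of the four notebooks in this repo

# Load the notebook and execute every cell with no timeout
# (mirrors --ExecutePreprocessor.timeout=-1 in the workflow).
with open(NOTEBOOK) as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=-1, kernel_name="python3")
ep.preprocess(nb, {"metadata": {"path": "."}})

# Running the notebook writes example_model.pkl into the working directory,
# which the workflow then moves into cached-pickles/ and commits to master.
```
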
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | # Data
92 | *.csv
93 | *.parquet
94 | *.json
95 | *.model
96 |
97 | .idea
98 | example_model.xgb
99 |
100 | .DS_Store
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Numerai
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Numerai Example Scripts
2 |
3 | A collection of scripts and notebooks to help you get started quickly.
4 |
5 | Need help? Find us on Discord:
6 |
7 | [Join us on Discord](https://discord.gg/numerai)
8 |
9 |
10 | ## Notebooks
11 |
12 | Try running these notebooks on Google Colab's free tier!
13 |
14 | ### Hello Numerai
15 |
16 | [Open in Colab](https://colab.research.google.com/github/numerai/example-scripts/blob/master/hello_numerai.ipynb)
17 |
18 |
19 | Start here if you are new! Explore the dataset and build your first model.
20 |
21 | ### Feature Neutralization
22 |
23 | [Open in Colab](https://colab.research.google.com/github/numerai/example-scripts/blob/master/feature_neutralization.ipynb)
24 |
25 |
26 | Learn how to measure feature risk and control it with feature neutralization.
27 |
28 | ### Target Ensemble
29 |
30 | [Open in Colab](https://colab.research.google.com/github/numerai/example-scripts/blob/master/target_ensemble.ipynb)
31 |
32 |
33 | Learn how to create an ensemble trained on different targets.
34 |
35 | ### Model Upload
36 |
37 | [Open in Colab](https://colab.research.google.com/github/numerai/example-scripts/blob/master/example_model.ipynb)
38 |
39 |
40 | A barebones example of how to build and upload your model to Numerai.
--------------------------------------------------------------------------------
/cached-pickles/example_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/example_model.pkl
--------------------------------------------------------------------------------
/cached-pickles/feature_neutralization.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/feature_neutralization.pkl
--------------------------------------------------------------------------------
/cached-pickles/hello_numerai.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/hello_numerai.pkl
--------------------------------------------------------------------------------
/cached-pickles/target_ensemble.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/target_ensemble.pkl
--------------------------------------------------------------------------------
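
The files above are the artifacts that the build workflow commits back into `cached-pickles/`. An illustrative sketch of fetching and unpickling one of them locally; this assumes an environment matching the numerai-predict requirements, since unpickling imports lightgbm, pandas, cloudpickle, and friends:

```python
# Sketch only: download a cached pickle and unpickle the predict function inside it.
# Only unpickle files from sources you trust; unpickling can execute arbitrary code.
import pickle
import urllib.request

URL = (
    "https://raw.githubusercontent.com/numerai/example-scripts/"
    "1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/example_model.pkl"
)

with urllib.request.urlopen(URL) as resp:
    predict = pickle.loads(resp.read())

# `predict` is the function defined in example_model.ipynb:
# predict(live_features: pd.DataFrame, live_benchmark_models: pd.DataFrame) -> pd.DataFrame
print(predict)
```
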
/example_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "ZqK_u9k-hMqE"
7 | },
8 | "source": [
9 | "# Model Upload"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "colab": {
17 | "base_uri": "https://localhost:8080/"
18 | },
19 | "id": "Ekw8Z93ljC3v",
20 | "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc"
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "Python 3.10.12\n"
28 | ]
29 | }
30 | ],
31 | "source": [
32 | "!python --version"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "colab": {
40 | "base_uri": "https://localhost:8080/"
41 | },
42 | "id": "yoy_wT1rhMqF",
43 | "outputId": "4268fdb0-84d2-4502-97e4-e93a1440c8ee"
44 | },
45 | "outputs": [
46 | {
47 | "name": "stdout",
48 | "output_type": "stream",
49 | "text": [
50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
51 | "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
52 | "lida 0.0.10 requires fastapi, which is not installed.\n",
53 | "lida 0.0.10 requires kaleido, which is not installed.\n",
54 | "lida 0.0.10 requires python-multipart, which is not installed.\n",
55 | "lida 0.0.10 requires uvicorn, which is not installed.\u001b[0m\u001b[31m\n",
56 | "\u001b[0m"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "# Install dependencies\n",
62 | "!pip install -q numerapi pandas lightgbm cloudpickle==2.2.1 pyarrow scikit-learn scipy==1.10.1"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {
69 | "colab": {
70 | "base_uri": "https://localhost:8080/",
71 | "height": 17
72 | },
73 | "id": "13hdRk9ghMqI",
74 | "outputId": "857a4882-83e5-4a76-9b1e-57d6d822cc67"
75 | },
76 | "outputs": [
77 | {
78 | "data": {
79 | "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ",
80 | "text/plain": [
81 | ""
82 | ]
83 | },
84 | "metadata": {},
85 | "output_type": "display_data"
86 | },
87 | {
88 | "data": {
89 | "application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)",
90 | "text/plain": [
91 | ""
92 | ]
93 | },
94 | "metadata": {},
95 | "output_type": "display_data"
96 | }
97 | ],
98 | "source": [
99 | "from numerapi import NumerAPI\n",
100 | "import pandas as pd\n",
101 | "import json\n",
102 | "napi = NumerAPI()\n",
103 | "\n",
104 | "# use one of the latest data versions\n",
105 | "DATA_VERSION = \"v5.0\"\n",
106 | "\n",
107 | "# Download data\n",
108 | "napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
109 | "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
110 | "\n",
111 | "# Load data\n",
112 | "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
113 | "features = feature_metadata[\"feature_sets\"][\"small\"]\n",
114 | "# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
115 | "# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
116 | "# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
117 | "train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
118 | "\n",
119 | "# For better models, join train and validation data and train on all of it.\n",
120 | "# This would cause diagnostics to be misleading though.\n",
121 | "# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
122 | "# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
123 | "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
124 | "# train = pd.concat([train, validation])\n",
125 | "\n",
126 | "# Downsample for speed\n",
127 | "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])] # skip this step for better performance\n",
128 | "\n",
129 | "# Train model\n",
130 | "import lightgbm as lgb\n",
131 | "model = lgb.LGBMRegressor(\n",
132 | " n_estimators=2000,\n",
133 | " learning_rate=0.01,\n",
134 | " max_depth=5,\n",
135 | " num_leaves=2**5-1,\n",
136 | " colsample_bytree=0.1\n",
137 | ")\n",
138 | "# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n",
139 | "# model = lgb.LGBMRegressor(\n",
140 | "# n_estimators=30_000,\n",
141 | "# learning_rate=0.001,\n",
142 | "# max_depth=10,\n",
143 | "# num_leaves=2**10,\n",
144 | "# colsample_bytree=0.1,\n",
145 | "# min_data_in_leaf=10000,\n",
146 | "# )\n",
147 | "model.fit(\n",
148 | " train[features],\n",
149 | " train[\"target\"]\n",
150 | ")\n",
151 | "\n",
152 | "# Define predict function\n",
153 | "def predict(\n",
154 | " live_features: pd.DataFrame,\n",
155 | " live_benchmark_models: pd.DataFrame\n",
156 | " ) -> pd.DataFrame:\n",
157 | " live_predictions = model.predict(live_features[features])\n",
158 | " submission = pd.Series(live_predictions, index=live_features.index)\n",
159 | " return submission.to_frame(\"prediction\")\n",
160 | "\n",
161 | "# Pickle predict function\n",
162 | "import cloudpickle\n",
163 | "p = cloudpickle.dumps(predict)\n",
164 | "with open(\"example_model.pkl\", \"wb\") as f:\n",
165 | " f.write(p)\n",
166 | "\n",
167 | "# Download file if running in Google Colab\n",
168 | "try:\n",
169 | " from google.colab import files\n",
170 | " files.download('example_model.pkl')\n",
171 | "except:\n",
172 | " pass"
173 | ]
174 | }
175 | ],
176 | "metadata": {
177 | "colab": {
178 | "provenance": []
179 | },
180 | "kernelspec": {
181 | "display_name": "venv",
182 | "language": "python",
183 | "name": "python3"
184 | },
185 | "language_info": {
186 | "codemirror_mode": {
187 | "name": "ipython",
188 | "version": 3
189 | },
190 | "file_extension": ".py",
191 | "mimetype": "text/x-python",
192 | "name": "python",
193 | "nbconvert_exporter": "python",
194 | "pygments_lexer": "ipython3",
195 | "version": "3.10.12"
196 | },
197 | "orig_nbformat": 4
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 0
201 | }
202 |
--------------------------------------------------------------------------------
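
Before uploading `example_model.pkl`, it can help to smoke-test the pickled `predict` function on live features. A hypothetical sketch, assuming the notebook above has already written `example_model.pkl` and that the v5.0 data API exposes a `live.parquet` file (the live file name is an assumption here, not something shown in this repo):

```python
# Hypothetical smoke test for the pickle written by example_model.ipynb.
import pickle

import pandas as pd
from numerapi import NumerAPI

DATA_VERSION = "v5.0"
napi = NumerAPI()
napi.download_dataset(f"{DATA_VERSION}/live.parquet")  # assumed dataset path
live_features = pd.read_parquet(f"{DATA_VERSION}/live.parquet")

with open("example_model.pkl", "rb") as f:
    predict = pickle.load(f)

# The notebook's predict function ignores its second argument,
# so an empty frame is enough for a local check.
predictions = predict(live_features, pd.DataFrame())
print(predictions.head())
```
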
/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | THIS MODULE IS DEPRECATED. Use numerai-tools:
3 | https://github.com/numerai/numerai-tools
4 |
5 | If there is a feature missing from numerai-tools, please
6 | open an issue with a link to the function in this file you'd
7 | like to see.
8 | """
9 |
10 | import numpy as np
11 | import pandas as pd
12 | import scipy
13 | from tqdm import tqdm
14 | from pathlib import Path
15 | import json
16 |
17 | ERA_COL = "era"
18 | TARGET_COL = "target_cyrus_v4_20"
19 | DATA_TYPE_COL = "data_type"
20 | EXAMPLE_PREDS_COL = "example_preds"
21 | MODEL_FOLDER = "models"
22 | MODEL_CONFIGS_FOLDER = "model_configs"
23 | PREDICTION_FILES_FOLDER = "prediction_files"
24 |
25 |
26 | def save_prediction(df, name):
27 | """DEPRECATED"""
28 | try:
29 | Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True)
30 | except Exception as ex:
31 | pass
32 | df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True)
33 |
34 |
35 | def save_model(model, name):
36 | """DEPRECATED"""
37 | try:
38 | Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True)
39 | except Exception as ex:
40 | pass
41 | pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl")
42 |
43 |
44 | def load_model(name):
45 | """DEPRECATED"""
46 | path = Path(f"{MODEL_FOLDER}/{name}.pkl")
47 | if path.is_file():
48 | model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl")
49 | else:
50 | model = False
51 | return model
52 |
53 |
54 | def save_model_config(model_config, model_name):
55 | """DEPRECATED"""
56 | try:
57 | Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True)
58 | except Exception as ex:
59 | pass
60 | with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", "w") as fp:
61 | json.dump(model_config, fp)
62 |
63 |
64 | def load_model_config(model_name):
65 | """DEPRECATED"""
66 | path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json"
67 | path = Path(path_str)
68 | if path.is_file():
69 | with open(path_str, "r") as fp:
70 | model_config = json.load(fp)
71 | else:
72 | model_config = False
73 | return model_config
74 |
75 |
76 | def get_biggest_change_features(corrs, n):
77 | """DEPRECATED"""
78 | all_eras = corrs.index.sort_values()
79 | h1_eras = all_eras[: len(all_eras) // 2]
80 | h2_eras = all_eras[len(all_eras) // 2 :]
81 |
82 | h1_corr_means = corrs.loc[h1_eras, :].mean()
83 | h2_corr_means = corrs.loc[h2_eras, :].mean()
84 |
85 | corr_diffs = h2_corr_means - h1_corr_means
86 | worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
87 | return worst_n
88 |
89 |
90 | def get_time_series_cross_val_splits(data, cv=3, embargo=12):
91 | """DEPRECATED"""
92 | all_train_eras = data[ERA_COL].unique()
93 | len_split = len(all_train_eras) // cv
94 | test_splits = [
95 | all_train_eras[i * len_split : (i + 1) * len_split] for i in range(cv)
96 | ]
97 | # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv
98 | remainder = len(all_train_eras) % cv
99 | if remainder != 0:
100 | test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:])
101 |
102 | train_splits = []
103 | for test_split in test_splits:
104 | test_split_max = int(np.max(test_split))
105 | test_split_min = int(np.min(test_split))
106 | # get all of the eras that aren't in the test split
107 | train_split_not_embargoed = [
108 | e
109 | for e in all_train_eras
110 | if not (test_split_min <= int(e) <= test_split_max)
111 | ]
112 | # embargo the train split so we have no leakage.
113 | # one era is length 5, so we need to embargo by target_length/5 eras.
114 | # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras.
115 | train_split = [
116 | e
117 | for e in train_split_not_embargoed
118 | if abs(int(e) - test_split_max) > embargo
119 | and abs(int(e) - test_split_min) > embargo
120 | ]
121 | train_splits.append(train_split)
122 |
123 | # convenient way to iterate over train and test splits
124 | train_test_zip = zip(train_splits, test_splits)
125 | return train_test_zip
126 |
127 |
128 | def neutralize(
129 | df,
130 | columns,
131 | neutralizers=None,
132 | proportion=1.0,
133 | normalize=True,
134 | era_col="era",
135 | verbose=False,
136 | ):
137 | """DEPRECATED"""
138 | if neutralizers is None:
139 | neutralizers = []
140 | unique_eras = df[era_col].unique()
141 | computed = []
142 | if verbose:
143 | iterator = tqdm(unique_eras)
144 | else:
145 | iterator = unique_eras
146 | for u in iterator:
147 | df_era = df[df[era_col] == u]
148 | scores = df_era[columns].values
149 | if normalize:
150 | scores2 = []
151 | for x in scores.T:
152 | x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x)
153 | x = scipy.stats.norm.ppf(x)
154 | scores2.append(x)
155 | scores = np.array(scores2).T
156 | exposures = df_era[neutralizers].values
157 |
158 | scores -= proportion * exposures.dot(
159 | np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot(
160 | scores.astype(np.float32)
161 | )
162 | )
163 |
164 | scores /= scores.std(ddof=0)
165 |
166 | computed.append(scores)
167 |
168 | return pd.DataFrame(np.concatenate(computed), columns=columns, index=df.index)
169 |
170 |
171 | def neutralize_series(series, by, proportion=1.0):
172 | """DEPRECATED"""
173 | scores = series.values.reshape(-1, 1)
174 | exposures = by.values.reshape(-1, 1)
175 |
176 | # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
177 | exposures = np.hstack(
178 | (exposures, np.array([np.mean(series)] * len(exposures)).reshape(-1, 1))
179 | )
180 |
181 | correction = proportion * (
182 | exposures.dot(np.linalg.lstsq(exposures, scores, rcond=None)[0])
183 | )
184 | corrected_scores = scores - correction
185 | neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
186 | return neutralized
187 |
188 |
189 | def unif(df):
190 | """DEPRECATED"""
191 | x = (df.rank(method="first") - 0.5) / len(df)
192 | return pd.Series(x, index=df.index)
193 |
194 |
195 | def numerai_corr(preds, target):
196 | """DEPRECATED"""
197 | # rank (keeping ties) then gaussianize predictions to standardize prediction distributions
198 | ranked_preds = (preds.rank(method="average").values - 0.5) / preds.count()
199 | gauss_ranked_preds = scipy.stats.norm.ppf(ranked_preds)
200 | # center targets around 0
201 | centered_target = target - target.mean()
202 | # raise both preds and target to the power of 1.5 to accentuate the tails
203 | preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5
204 | target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5
205 | # finally return the Pearson correlation
206 | return np.corrcoef(preds_p15, target_p15)[0, 1]
207 |
208 |
209 | def get_feature_neutral_mean(
210 | df, prediction_col, target_col, features_for_neutralization=None
211 | ):
212 | """DEPRECATED"""
213 | if features_for_neutralization is None:
214 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
215 | df.loc[:, "neutral_sub"] = neutralize(
216 | df, [prediction_col], features_for_neutralization
217 | )[prediction_col]
218 | scores = (
219 | df.groupby("era")
220 | .apply(lambda x: numerai_corr(x["neutral_sub"], x[target_col]))
221 | .mean()
222 | )
223 | return np.mean(scores)
224 |
225 |
226 | def get_feature_neutral_mean_tb_era(
227 | df, prediction_col, target_col, tb, features_for_neutralization=None
228 | ):
229 | """DEPRECATED"""
230 | if features_for_neutralization is None:
231 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")]
232 | temp_df = df.reset_index(
233 | drop=True
234 | ).copy() # Reset index due to use of argsort later
235 | temp_df.loc[:, "neutral_sub"] = neutralize(
236 | temp_df, [prediction_col], features_for_neutralization
237 | )[prediction_col]
238 | temp_df_argsort = temp_df.loc[:, "neutral_sub"].argsort()
239 | temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb], temp_df_argsort.iloc[-tb:]])
240 | temp_df_tb = temp_df.loc[temp_df_tb_idx]
241 | tb_fnc = numerai_corr(temp_df_tb["neutral_sub"], temp_df_tb[target_col])
242 | return tb_fnc
243 |
244 |
245 | def fast_score_by_date(df, columns, target, tb=None, era_col="era"):
246 | """DEPRECATED"""
247 | unique_eras = df[era_col].unique()
248 | computed = []
249 | for u in unique_eras:
250 | df_era = df[df[era_col] == u]
251 | era_pred = np.float64(df_era[columns].values.T)
252 | era_target = np.float64(df_era[target].values.T)
253 |
254 | if tb is None:
255 | ccs = numerai_corr(era_pred, era_target)
256 | else:
257 | tbidx = np.argsort(era_pred, axis=1)
258 | tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1)
259 | ccs = [
260 | numerai_corr(pd.Series(tmppred[tmpidx]), pd.Series(era_target[tmpidx]))
261 | for tmpidx, tmppred in zip(tbidx, era_pred)
262 | ]
263 | ccs = np.array(ccs)
264 |
265 | computed.append(ccs)
266 |
267 | return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique())
268 |
269 |
270 | def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None):
271 | """DEPRECATED"""
272 | if feature_cols is None:
273 | feature_cols = [c for c in df.columns if c.startswith("feature")]
274 | u = df.loc[:, feature_cols].corrwith(df[prediction_col])
275 | e = df.loc[:, feature_cols].corrwith(df[example_col])
276 | return 1 - (np.dot(u, e) / np.dot(e, e))
277 |
278 |
279 | def validation_metrics(
280 | validation_data,
281 | pred_cols,
282 | example_col,
283 | fast_mode=False,
284 | target_col=TARGET_COL,
285 | features_for_neutralization=None,
286 | ):
287 | """DEPRECATED"""
288 | validation_stats = pd.DataFrame()
289 | feature_cols = [c for c in validation_data if c.startswith("feature_")]
290 | for pred_col in pred_cols:
291 | # Check the per-era correlations on the validation set (out of sample)
292 | validation_correlations = validation_data.groupby(ERA_COL).apply(
293 | lambda d: numerai_corr(d[pred_col], d[target_col])
294 | )
295 |
296 | mean = validation_correlations.mean()
297 | std = validation_correlations.std(ddof=0)
298 | sharpe = mean / std
299 |
300 | validation_stats.loc["mean", pred_col] = mean
301 | validation_stats.loc["std", pred_col] = std
302 | validation_stats.loc["sharpe", pred_col] = sharpe
303 |
304 | rolling_max = (
305 | (validation_correlations + 1)
306 | .cumprod()
307 | .rolling(window=9000, min_periods=1) # arbitrarily large
308 | .max()
309 | )
310 | daily_value = (validation_correlations + 1).cumprod()
311 | max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
312 | validation_stats.loc["max_drawdown", pred_col] = max_drawdown
313 |
314 | payout_scores = validation_correlations.clip(-0.25, 0.25)
315 | payout_daily_value = (payout_scores + 1).cumprod()
316 |
317 | apy = (
318 | ((payout_daily_value.dropna().iloc[-1]) ** (1 / len(payout_scores)))
319 | ** 49 # 52 weeks of compounding minus 3 for stake compounding lag
320 | - 1
321 | ) * 100
322 |
323 | validation_stats.loc["apy", pred_col] = apy
324 |
325 | if not fast_mode:
326 | # Check the feature exposure of your validation predictions
327 | max_per_era = validation_data.groupby(ERA_COL).apply(
328 | lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max()
329 | )
330 | max_feature_exposure = max_per_era.mean()
331 | validation_stats.loc["max_feature_exposure", pred_col] = (
332 | max_feature_exposure
333 | )
334 |
335 | # Check feature neutral mean
336 | feature_neutral_mean = get_feature_neutral_mean(
337 | validation_data, pred_col, target_col, features_for_neutralization
338 | )
339 | validation_stats.loc["feature_neutral_mean", pred_col] = (
340 | feature_neutral_mean
341 | )
342 |
343 | # Check TB200 feature neutral mean
344 | tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply(
345 | lambda df: get_feature_neutral_mean_tb_era(
346 | df, pred_col, target_col, 200, features_for_neutralization
347 | )
348 | )
349 | validation_stats.loc["tb200_feature_neutral_mean", pred_col] = (
350 | tb200_feature_neutral_mean_era.mean()
351 | )
352 |
353 | # Check top and bottom 200 metrics (TB200)
354 | tb200_validation_correlations = fast_score_by_date(
355 | validation_data, [pred_col], target_col, tb=200, era_col=ERA_COL
356 | )
357 |
358 | tb200_mean = tb200_validation_correlations.mean()[pred_col]
359 | tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col]
360 | tb200_sharpe = tb200_mean / tb200_std
361 |
362 | validation_stats.loc["tb200_mean", pred_col] = tb200_mean
363 | validation_stats.loc["tb200_std", pred_col] = tb200_std
364 | validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe
365 |
366 | # MMC over validation
367 | mmc_scores = []
368 | corr_scores = []
369 | for _, x in validation_data.groupby(ERA_COL):
370 | series = neutralize_series(unif(x[pred_col]), (x[example_col]))
371 | mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29**2))
372 | corr_scores.append(unif(x[pred_col]).corr(x[target_col]))
373 |
374 | val_mmc_mean = np.mean(mmc_scores)
375 | val_mmc_std = np.std(mmc_scores)
376 | corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
377 | corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
378 |
379 | validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean
380 | validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe
381 |
382 | # Check correlation with example predictions
383 | per_era_corrs = validation_data.groupby(ERA_COL).apply(
384 | lambda d: unif(d[pred_col]).corr(unif(d[example_col]))
385 | )
386 | corr_with_example_preds = per_era_corrs.mean()
387 | validation_stats.loc["corr_with_example_preds", pred_col] = (
388 | corr_with_example_preds
389 | )
390 |
391 | # Check exposure dissimilarity per era
392 | tdf = validation_data.groupby(ERA_COL).apply(
393 | lambda df: exposure_dissimilarity_per_era(
394 | df, pred_col, example_col, feature_cols
395 | )
396 | )
397 | validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean()
398 |
399 | # .transpose so that stats are columns and the model_name is the row
400 | return validation_stats.transpose()
401 |
--------------------------------------------------------------------------------
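
For reference, a toy usage sketch of the deprecated helpers above. The column names and random data are invented purely for illustration; in practice the frame would be validation data joined with your model's predictions and Numerai's example predictions:

```python
# Toy example exercising neutralize, numerai_corr, and validation_metrics from utils.py.
import numpy as np
import pandas as pd

from utils import ERA_COL, neutralize, numerai_corr, validation_metrics

rng = np.random.default_rng(0)
n = 1_000
toy = pd.DataFrame({
    ERA_COL: np.repeat([str(e).zfill(4) for e in range(1, 11)], n // 10),
    "feature_a": rng.normal(size=n),
    "feature_b": rng.normal(size=n),
    "prediction": rng.uniform(size=n),
    "example_preds": rng.uniform(size=n),
    "target_cyrus_v4_20": rng.uniform(size=n),  # matches the module's TARGET_COL
})

# Per-era scoring correlation of a prediction column against the target.
per_era_corr = toy.groupby(ERA_COL).apply(
    lambda d: numerai_corr(d["prediction"], d["target_cyrus_v4_20"])
)
print(per_era_corr.mean())

# Neutralize predictions to the two toy features within each era.
neutral = neutralize(toy, ["prediction"], ["feature_a", "feature_b"], proportion=1.0)

# Summary metric table (mean, sharpe, drawdown, etc.) for one or more prediction columns.
stats = validation_metrics(toy, ["prediction"], example_col="example_preds", fast_mode=True)
print(stats)
```

As the module docstring says, prefer numerai-tools for new work; this sketch only shows how the legacy helpers fit together.
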