├── .github └── workflows │ └── build-models.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── cached-pickles ├── example_model.pkl ├── feature_neutralization.pkl ├── hello_numerai.pkl └── target_ensemble.pkl ├── example_model.ipynb ├── feature_neutralization.ipynb ├── hello_numerai.ipynb ├── target_ensemble.ipynb └── utils.py /.github/workflows/build-models.yml: -------------------------------------------------------------------------------- 1 | name: Build Example Model Pickles 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | # paths: 7 | # - example_model.ipynb 8 | # - hello_numerai.ipynb 9 | # - feature_neutralization.ipynb 10 | # - target_ensemble.ipynb 11 | branches: 12 | - master 13 | 14 | concurrency: build-example-models 15 | 16 | jobs: 17 | 18 | build_and_test: 19 | name: "Build Example Model Pickles" 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.10" 26 | - name: Install jupyter 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install jupyter 30 | pip install -r https://raw.githubusercontent.com/numerai/numerai-predict/refs/heads/master/requirements.txt 31 | - name: build-example-model 32 | run: | 33 | jupyter nbconvert \ 34 | --execute example_model.ipynb \ 35 | --ExecutePreprocessor.timeout=-1 \ 36 | --to html 37 | - name: build-hello-numerai 38 | run: | 39 | jupyter nbconvert \ 40 | --execute hello_numerai.ipynb \ 41 | --ExecutePreprocessor.timeout=-1 \ 42 | --to html 43 | - name: build-feature-neutralization 44 | run: | 45 | jupyter nbconvert \ 46 | --execute feature_neutralization.ipynb \ 47 | --ExecutePreprocessor.timeout=-1 \ 48 | --to html 49 | - name: build-target-ensemble 50 | run: | 51 | jupyter nbconvert \ 52 | --execute target_ensemble.ipynb \ 53 | --ExecutePreprocessor.timeout=-1 \ 54 | --to html 55 | - name: delete-html 56 | run: | 57 | rm example_model.html 58 | rm hello_numerai.html 59 | rm feature_neutralization.html 60 | rm target_ensemble.html 61 | - name: move-pickles-to-cached-pickles-dir 62 | run: | 63 | mkdir -p cached-pickles/ 64 | mv -f example_model.pkl cached-pickles/ 65 | mv -f hello_numerai.pkl cached-pickles/ 66 | mv -f feature_neutralization.pkl cached-pickles/ 67 | mv -f target_ensemble.pkl cached-pickles/ 68 | - name: commit-to-master 69 | uses: EndBug/add-and-commit@v9 70 | with: 71 | add: "cached-pickles/*" 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Data 92 | *.csv 93 | *.parquet 94 | *.json 95 | *.model 96 | 97 | .idea 98 | example_model.xgb 99 | 100 | .DS_Store -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Numerai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Numerai Example Scripts 2 | 3 | A collection of scripts and notebooks to help you get started quickly. 4 | 5 | Need help? Find us on Discord: 6 | 7 | [![](https://dcbadge.vercel.app/api/server/numerai)](https://discord.gg/numerai) 8 | 9 | 10 | ## Notebooks 11 | 12 | Try running these notebooks on Google Colab's free tier! 13 | 14 | ### Hello Numerai 15 | 16 | Open In Colab 17 | 18 | 19 | Start here if you are new! Explore the dataset and build your first model. 20 | 21 | ### Feature Neutralization 22 | 23 | Open In Colab 24 | 25 | 26 | Learn how to measure feature risk and control it with feature neutralization. 27 | 28 | ### Target Ensemble 29 | 30 | Open In Colab 31 | 32 | 33 | Learn how to create an ensemble trained on different targets. 34 | 35 | ### Model Upload 36 | 37 | Open In Colab 38 | 39 | 40 | A barebones example of how to build and upload your model to Numerai. 
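### Run a cached pickle locally

Each notebook ends by pickling a `predict(live_features, live_benchmark_models)` function, and the latest builds are committed to `cached-pickles/` by the GitHub Actions workflow. The sketch below is a rough illustration of how such a pickle can be exercised on your own machine, not the official Numerai runner; the `v5.0/live.parquet` path and passing `None` for the benchmark frame (which the example model ignores) are assumptions made for this example.

```python
# Minimal local smoke test of a pickled predict function (illustrative sketch only).
import cloudpickle
import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()
napi.download_dataset("v5.0/live.parquet")  # assumed path for current live features
live_features = pd.read_parquet("v5.0/live.parquet")

# Load the cached predict function built from example_model.ipynb
with open("cached-pickles/example_model.pkl", "rb") as f:
    predict = cloudpickle.load(f)

# example_model's predict ignores live_benchmark_models, so None is passed here.
predictions = predict(live_features, None)
print(predictions.head())
```

The result is a single `prediction` column indexed like the live features, which is the shape Numerai expects from an uploaded model.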
-------------------------------------------------------------------------------- /cached-pickles/example_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/example_model.pkl -------------------------------------------------------------------------------- /cached-pickles/feature_neutralization.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/feature_neutralization.pkl -------------------------------------------------------------------------------- /cached-pickles/hello_numerai.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/hello_numerai.pkl -------------------------------------------------------------------------------- /cached-pickles/target_ensemble.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numerai/example-scripts/1f4bd2e230f82fb7c6d3c6ce6dd6fe9388dde83c/cached-pickles/target_ensemble.pkl -------------------------------------------------------------------------------- /example_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ZqK_u9k-hMqE" 7 | }, 8 | "source": [ 9 | "# Model Upload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "Ekw8Z93ljC3v", 20 | "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Python 3.10.12\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "!python --version" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "yoy_wT1rhMqF", 43 | "outputId": "4268fdb0-84d2-4502-97e4-e93a1440c8ee" 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 51 | "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", 52 | "lida 0.0.10 requires fastapi, which is not installed.\n", 53 | "lida 0.0.10 requires kaleido, which is not installed.\n", 54 | "lida 0.0.10 requires python-multipart, which is not installed.\n", 55 | "lida 0.0.10 requires uvicorn, which is not installed.\u001b[0m\u001b[31m\n", 56 | "\u001b[0m" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Install dependencies\n", 62 | "!pip install -q numerapi pandas lightgbm cloudpickle==2.2.1 pyarrow scikit-learn scipy==1.10.1" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/", 71 | "height": 17 72 | }, 73 | "id": "13hdRk9ghMqI", 74 | "outputId": "857a4882-83e5-4a76-9b1e-57d6d822cc67" 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ", 80 | "text/plain": [ 81 | "" 82 | ] 83 | }, 84 | "metadata": {}, 85 | "output_type": "display_data" 86 | }, 87 | { 88 | "data": { 89 | "application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)", 90 | "text/plain": [ 91 | "" 92 | ] 93 | }, 94 | "metadata": {}, 95 | "output_type": "display_data" 96 | } 97 | ], 98 | "source": [ 99 | "from numerapi import NumerAPI\n", 100 | "import pandas as pd\n", 101 | "import json\n", 102 | "napi = NumerAPI()\n", 103 | "\n", 104 | "# use one of the latest data versions\n", 105 | "DATA_VERSION = \"v5.0\"\n", 106 | "\n", 107 | "# Download data\n", 108 | "napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n", 109 | "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n", 110 | "\n", 111 | "# Load data\n", 112 | "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n", 113 | "features = feature_metadata[\"feature_sets\"][\"small\"]\n", 114 | "# use \"medium\" or \"all\" for better performance. 
Requires more RAM.\n", 115 | "# features = feature_metadata[\"feature_sets\"][\"medium\"]\n", 116 | "# features = feature_metadata[\"feature_sets\"][\"all\"]\n", 117 | "train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n", 118 | "\n", 119 | "# For better models, join train and validation data and train on all of it.\n", 120 | "# This would cause diagnostics to be misleading though.\n", 121 | "# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n", 122 | "# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n", 123 | "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n", 124 | "# train = pd.concat([train, validation])\n", 125 | "\n", 126 | "# Downsample for speed\n", 127 | "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])] # skip this step for better performance\n", 128 | "\n", 129 | "# Train model\n", 130 | "import lightgbm as lgb\n", 131 | "model = lgb.LGBMRegressor(\n", 132 | " n_estimators=2000,\n", 133 | " learning_rate=0.01,\n", 134 | " max_depth=5,\n", 135 | " num_leaves=2**5-1,\n", 136 | " colsample_bytree=0.1\n", 137 | ")\n", 138 | "# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n", 139 | "# model = lgb.LGBMRegressor(\n", 140 | "# n_estimators=30_000,\n", 141 | "# learning_rate=0.001,\n", 142 | "# max_depth=10,\n", 143 | "# num_leaves=2**10,\n", 144 | "# colsample_bytree=0.1,\n", 145 | "# min_data_in_leaf=10000,\n", 146 | "# )\n", 147 | "model.fit(\n", 148 | " train[features],\n", 149 | " train[\"target\"]\n", 150 | ")\n", 151 | "\n", 152 | "# Define predict function\n", 153 | "def predict(\n", 154 | " live_features: pd.DataFrame,\n", 155 | " live_benchmark_models: pd.DataFrame\n", 156 | " ) -> pd.DataFrame:\n", 157 | " live_predictions = model.predict(live_features[features])\n", 158 | " submission = pd.Series(live_predictions, index=live_features.index)\n", 159 | " return submission.to_frame(\"prediction\")\n", 160 | "\n", 161 | "# Pickle predict function\n", 162 | "import cloudpickle\n", 163 | "p = cloudpickle.dumps(predict)\n", 164 | "with open(\"example_model.pkl\", \"wb\") as f:\n", 165 | " f.write(p)\n", 166 | "\n", 167 | "# Download file if running in Google Colab\n", 168 | "try:\n", 169 | " from google.colab import files\n", 170 | " files.download('example_model.pkl')\n", 171 | "except:\n", 172 | " pass" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "colab": { 178 | "provenance": [] 179 | }, 180 | "kernelspec": { 181 | "display_name": "venv", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.10.12" 196 | }, 197 | "orig_nbformat": 4 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 0 201 | } 202 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | THIS MODULE IS DEPRECATED. Use numerai-tools: 3 | https://github.com/numerai/numerai-tools 4 | 5 | If there is a feature missing from numerai-tools, please 6 | open an issue with a link to the function in this file you'd 7 | like to see. 
8 | """ 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import scipy 13 | from tqdm import tqdm 14 | from pathlib import Path 15 | import json 16 | 17 | ERA_COL = "era" 18 | TARGET_COL = "target_cyrus_v4_20" 19 | DATA_TYPE_COL = "data_type" 20 | EXAMPLE_PREDS_COL = "example_preds" 21 | MODEL_FOLDER = "models" 22 | MODEL_CONFIGS_FOLDER = "model_configs" 23 | PREDICTION_FILES_FOLDER = "prediction_files" 24 | 25 | 26 | def save_prediction(df, name): 27 | """DEPRECATED""" 28 | try: 29 | Path(PREDICTION_FILES_FOLDER).mkdir(exist_ok=True, parents=True) 30 | except Exception as ex: 31 | pass 32 | df.to_csv(f"{PREDICTION_FILES_FOLDER}/{name}.csv", index=True) 33 | 34 | 35 | def save_model(model, name): 36 | """DEPRECATED""" 37 | try: 38 | Path(MODEL_FOLDER).mkdir(exist_ok=True, parents=True) 39 | except Exception as ex: 40 | pass 41 | pd.to_pickle(model, f"{MODEL_FOLDER}/{name}.pkl") 42 | 43 | 44 | def load_model(name): 45 | """DEPRECATED""" 46 | path = Path(f"{MODEL_FOLDER}/{name}.pkl") 47 | if path.is_file(): 48 | model = pd.read_pickle(f"{MODEL_FOLDER}/{name}.pkl") 49 | else: 50 | model = False 51 | return model 52 | 53 | 54 | def save_model_config(model_config, model_name): 55 | """DEPRECATED""" 56 | try: 57 | Path(MODEL_CONFIGS_FOLDER).mkdir(exist_ok=True, parents=True) 58 | except Exception as ex: 59 | pass 60 | with open(f"{MODEL_CONFIGS_FOLDER}/{model_name}.json", "w") as fp: 61 | json.dump(model_config, fp) 62 | 63 | 64 | def load_model_config(model_name): 65 | """DEPRECATED""" 66 | path_str = f"{MODEL_CONFIGS_FOLDER}/{model_name}.json" 67 | path = Path(path_str) 68 | if path.is_file(): 69 | with open(path_str, "r") as fp: 70 | model_config = json.load(fp) 71 | else: 72 | model_config = False 73 | return model_config 74 | 75 | 76 | def get_biggest_change_features(corrs, n): 77 | """DEPRECATED""" 78 | all_eras = corrs.index.sort_values() 79 | h1_eras = all_eras[: len(all_eras) // 2] 80 | h2_eras = all_eras[len(all_eras) // 2 :] 81 | 82 | h1_corr_means = corrs.loc[h1_eras, :].mean() 83 | h2_corr_means = corrs.loc[h2_eras, :].mean() 84 | 85 | corr_diffs = h2_corr_means - h1_corr_means 86 | worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist() 87 | return worst_n 88 | 89 | 90 | def get_time_series_cross_val_splits(data, cv=3, embargo=12): 91 | """DEPRECATED""" 92 | all_train_eras = data[ERA_COL].unique() 93 | len_split = len(all_train_eras) // cv 94 | test_splits = [ 95 | all_train_eras[i * len_split : (i + 1) * len_split] for i in range(cv) 96 | ] 97 | # fix the last test split to have all the last eras, in case the number of eras wasn't divisible by cv 98 | remainder = len(all_train_eras) % cv 99 | if remainder != 0: 100 | test_splits[-1] = np.append(test_splits[-1], all_train_eras[-remainder:]) 101 | 102 | train_splits = [] 103 | for test_split in test_splits: 104 | test_split_max = int(np.max(test_split)) 105 | test_split_min = int(np.min(test_split)) 106 | # get all of the eras that aren't in the test split 107 | train_split_not_embargoed = [ 108 | e 109 | for e in all_train_eras 110 | if not (test_split_min <= int(e) <= test_split_max) 111 | ] 112 | # embargo the train split so we have no leakage. 113 | # one era is length 5, so we need to embargo by target_length/5 eras. 114 | # To be consistent for all targets, let's embargo everything by 60/5 == 12 eras. 
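        # e.g. if a test split covers eras 41-80 and embargo == 12,
        # only eras <= 28 or >= 93 remain in its train split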
115 | train_split = [ 116 | e 117 | for e in train_split_not_embargoed 118 | if abs(int(e) - test_split_max) > embargo 119 | and abs(int(e) - test_split_min) > embargo 120 | ] 121 | train_splits.append(train_split) 122 | 123 | # convenient way to iterate over train and test splits 124 | train_test_zip = zip(train_splits, test_splits) 125 | return train_test_zip 126 | 127 | 128 | def neutralize( 129 | df, 130 | columns, 131 | neutralizers=None, 132 | proportion=1.0, 133 | normalize=True, 134 | era_col="era", 135 | verbose=False, 136 | ): 137 | """DEPRECATED""" 138 | if neutralizers is None: 139 | neutralizers = [] 140 | unique_eras = df[era_col].unique() 141 | computed = [] 142 | if verbose: 143 | iterator = tqdm(unique_eras) 144 | else: 145 | iterator = unique_eras 146 | for u in iterator: 147 | df_era = df[df[era_col] == u] 148 | scores = df_era[columns].values 149 | if normalize: 150 | scores2 = [] 151 | for x in scores.T: 152 | x = (scipy.stats.rankdata(x, method="ordinal") - 0.5) / len(x) 153 | x = scipy.stats.norm.ppf(x) 154 | scores2.append(x) 155 | scores = np.array(scores2).T 156 | exposures = df_era[neutralizers].values 157 | 158 | scores -= proportion * exposures.dot( 159 | np.linalg.pinv(exposures.astype(np.float32), rcond=1e-6).dot( 160 | scores.astype(np.float32) 161 | ) 162 | ) 163 | 164 | scores /= scores.std(ddof=0) 165 | 166 | computed.append(scores) 167 | 168 | return pd.DataFrame(np.concatenate(computed), columns=columns, index=df.index) 169 | 170 | 171 | def neutralize_series(series, by, proportion=1.0): 172 | """DEPRECATED""" 173 | scores = series.values.reshape(-1, 1) 174 | exposures = by.values.reshape(-1, 1) 175 | 176 | # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures 177 | exposures = np.hstack( 178 | (exposures, np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)) 179 | ) 180 | 181 | correction = proportion * ( 182 | exposures.dot(np.linalg.lstsq(exposures, scores, rcond=None)[0]) 183 | ) 184 | corrected_scores = scores - correction 185 | neutralized = pd.Series(corrected_scores.ravel(), index=series.index) 186 | return neutralized 187 | 188 | 189 | def unif(df): 190 | """DEPRECATED""" 191 | x = (df.rank(method="first") - 0.5) / len(df) 192 | return pd.Series(x, index=df.index) 193 | 194 | 195 | def numerai_corr(preds, target): 196 | """DEPRECATED""" 197 | # rank (keeping ties) then gaussianize predictions to standardize prediction distributions 198 | ranked_preds = (preds.rank(method="average").values - 0.5) / preds.count() 199 | gauss_ranked_preds = scipy.stats.norm.ppf(ranked_preds) 200 | # center targets around 0 201 | centered_target = target - target.mean() 202 | # raise both preds and target to the power of 1.5 to accentuate the tails 203 | preds_p15 = np.sign(gauss_ranked_preds) * np.abs(gauss_ranked_preds) ** 1.5 204 | target_p15 = np.sign(centered_target) * np.abs(centered_target) ** 1.5 205 | # finally return the Pearson correlation 206 | return np.corrcoef(preds_p15, target_p15)[0, 1] 207 | 208 | 209 | def get_feature_neutral_mean( 210 | df, prediction_col, target_col, features_for_neutralization=None 211 | ): 212 | """DEPRECATED""" 213 | if features_for_neutralization is None: 214 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")] 215 | df.loc[:, "neutral_sub"] = neutralize( 216 | df, [prediction_col], features_for_neutralization 217 | )[prediction_col] 218 | scores = ( 219 | df.groupby("era") 220 | .apply(lambda x: numerai_corr(x["neutral_sub"], 
x[target_col])) 221 | .mean() 222 | ) 223 | return np.mean(scores) 224 | 225 | 226 | def get_feature_neutral_mean_tb_era( 227 | df, prediction_col, target_col, tb, features_for_neutralization=None 228 | ): 229 | """DEPRECATED""" 230 | if features_for_neutralization is None: 231 | features_for_neutralization = [c for c in df.columns if c.startswith("feature")] 232 | temp_df = df.reset_index( 233 | drop=True 234 | ).copy() # Reset index due to use of argsort later 235 | temp_df.loc[:, "neutral_sub"] = neutralize( 236 | temp_df, [prediction_col], features_for_neutralization 237 | )[prediction_col] 238 | temp_df_argsort = temp_df.loc[:, "neutral_sub"].argsort() 239 | temp_df_tb_idx = pd.concat([temp_df_argsort.iloc[:tb], temp_df_argsort.iloc[-tb:]]) 240 | temp_df_tb = temp_df.loc[temp_df_tb_idx] 241 | tb_fnc = numerai_corr(temp_df_tb["neutral_sub"], temp_df_tb[target_col]) 242 | return tb_fnc 243 | 244 | 245 | def fast_score_by_date(df, columns, target, tb=None, era_col="era"): 246 | """DEPRECATED""" 247 | unique_eras = df[era_col].unique() 248 | computed = [] 249 | for u in unique_eras: 250 | df_era = df[df[era_col] == u] 251 | era_pred = np.float64(df_era[columns].values.T) 252 | era_target = np.float64(df_era[target].values.T) 253 | 254 | if tb is None: 255 | ccs = numerai_corr(era_pred, era_target) 256 | else: 257 | tbidx = np.argsort(era_pred, axis=1) 258 | tbidx = np.concatenate([tbidx[:, :tb], tbidx[:, -tb:]], axis=1) 259 | ccs = [ 260 | numerai_corr(pd.Series(era_target[tmpidx]), pd.Series(tmppred[tmpidx])) 261 | for tmpidx, tmppred in zip(tbidx, era_pred) 262 | ] 263 | ccs = np.array(ccs) 264 | 265 | computed.append(ccs) 266 | 267 | return pd.DataFrame(np.array(computed), columns=columns, index=df[era_col].unique()) 268 | 269 | 270 | def exposure_dissimilarity_per_era(df, prediction_col, example_col, feature_cols=None): 271 | """DEPRECATED""" 272 | if feature_cols is None: 273 | feature_cols = [c for c in df.columns if c.startswith("feature")] 274 | u = df.loc[:, feature_cols].corrwith(df[prediction_col]) 275 | e = df.loc[:, feature_cols].corrwith(df[example_col]) 276 | return 1 - (np.dot(u, e) / np.dot(e, e)) 277 | 278 | 279 | def validation_metrics( 280 | validation_data, 281 | pred_cols, 282 | example_col, 283 | fast_mode=False, 284 | target_col=TARGET_COL, 285 | features_for_neutralization=None, 286 | ): 287 | """DEPRECATED""" 288 | validation_stats = pd.DataFrame() 289 | feature_cols = [c for c in validation_data if c.startswith("feature_")] 290 | for pred_col in pred_cols: 291 | # Check the per-era correlations on the validation set (out of sample) 292 | validation_correlations = validation_data.groupby(ERA_COL).apply( 293 | lambda d: numerai_corr(d[pred_col], d[target_col]) 294 | ) 295 | 296 | mean = validation_correlations.mean() 297 | std = validation_correlations.std(ddof=0) 298 | sharpe = mean / std 299 | 300 | validation_stats.loc["mean", pred_col] = mean 301 | validation_stats.loc["std", pred_col] = std 302 | validation_stats.loc["sharpe", pred_col] = sharpe 303 | 304 | rolling_max = ( 305 | (validation_correlations + 1) 306 | .cumprod() 307 | .rolling(window=9000, min_periods=1) # arbitrarily large 308 | .max() 309 | ) 310 | daily_value = (validation_correlations + 1).cumprod() 311 | max_drawdown = -((rolling_max - daily_value) / rolling_max).max() 312 | validation_stats.loc["max_drawdown", pred_col] = max_drawdown 313 | 314 | payout_scores = validation_correlations.clip(-0.25, 0.25) 315 | payout_daily_value = (payout_scores + 1).cumprod() 316 | 317 | apy = ( 318 | 
((payout_daily_value.dropna().iloc[-1]) ** (1 / len(payout_scores))) 319 | ** 49 # 52 weeks of compounding minus 3 for stake compounding lag 320 | - 1 321 | ) * 100 322 | 323 | validation_stats.loc["apy", pred_col] = apy 324 | 325 | if not fast_mode: 326 | # Check the feature exposure of your validation predictions 327 | max_per_era = validation_data.groupby(ERA_COL).apply( 328 | lambda d: d[feature_cols].corrwith(d[pred_col]).abs().max() 329 | ) 330 | max_feature_exposure = max_per_era.mean() 331 | validation_stats.loc["max_feature_exposure", pred_col] = ( 332 | max_feature_exposure 333 | ) 334 | 335 | # Check feature neutral mean 336 | feature_neutral_mean = get_feature_neutral_mean( 337 | validation_data, pred_col, target_col, features_for_neutralization 338 | ) 339 | validation_stats.loc["feature_neutral_mean", pred_col] = ( 340 | feature_neutral_mean 341 | ) 342 | 343 | # Check TB200 feature neutral mean 344 | tb200_feature_neutral_mean_era = validation_data.groupby(ERA_COL).apply( 345 | lambda df: get_feature_neutral_mean_tb_era( 346 | df, pred_col, target_col, 200, features_for_neutralization 347 | ) 348 | ) 349 | validation_stats.loc["tb200_feature_neutral_mean", pred_col] = ( 350 | tb200_feature_neutral_mean_era.mean() 351 | ) 352 | 353 | # Check top and bottom 200 metrics (TB200) 354 | tb200_validation_correlations = fast_score_by_date( 355 | validation_data, [pred_col], target_col, tb=200, era_col=ERA_COL 356 | ) 357 | 358 | tb200_mean = tb200_validation_correlations.mean()[pred_col] 359 | tb200_std = tb200_validation_correlations.std(ddof=0)[pred_col] 360 | tb200_sharpe = tb200_mean / tb200_std 361 | 362 | validation_stats.loc["tb200_mean", pred_col] = tb200_mean 363 | validation_stats.loc["tb200_std", pred_col] = tb200_std 364 | validation_stats.loc["tb200_sharpe", pred_col] = tb200_sharpe 365 | 366 | # MMC over validation 367 | mmc_scores = [] 368 | corr_scores = [] 369 | for _, x in validation_data.groupby(ERA_COL): 370 | series = neutralize_series(unif(x[pred_col]), (x[example_col])) 371 | mmc_scores.append(np.cov(series, x[target_col])[0, 1] / (0.29**2)) 372 | corr_scores.append(unif(x[pred_col]).corr(x[target_col])) 373 | 374 | val_mmc_mean = np.mean(mmc_scores) 375 | val_mmc_std = np.std(mmc_scores) 376 | corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)] 377 | corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs) 378 | 379 | validation_stats.loc["mmc_mean", pred_col] = val_mmc_mean 380 | validation_stats.loc["corr_plus_mmc_sharpe", pred_col] = corr_plus_mmc_sharpe 381 | 382 | # Check correlation with example predictions 383 | per_era_corrs = validation_data.groupby(ERA_COL).apply( 384 | lambda d: unif(d[pred_col]).corr(unif(d[example_col])) 385 | ) 386 | corr_with_example_preds = per_era_corrs.mean() 387 | validation_stats.loc["corr_with_example_preds", pred_col] = ( 388 | corr_with_example_preds 389 | ) 390 | 391 | # Check exposure dissimilarity per era 392 | tdf = validation_data.groupby(ERA_COL).apply( 393 | lambda df: exposure_dissimilarity_per_era( 394 | df, pred_col, example_col, feature_cols 395 | ) 396 | ) 397 | validation_stats.loc["exposure_dissimilarity_mean", pred_col] = tdf.mean() 398 | 399 | # .transpose so that stats are columns and the model_name is the row 400 | return validation_stats.transpose() 401 | --------------------------------------------------------------------------------
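Although `utils.py` is deprecated in favor of numerai-tools, `validation_metrics` still shows what a full diagnostic pass computes. The sketch below is illustrative only: the `v5.0` paths, the `small` feature set, and the stand-in prediction columns (raw features used in place of a real model and of example predictions) are assumptions, not part of this repo.

```python
# Illustrative smoke test of the deprecated validation_metrics helper (prefer numerai-tools).
import json
import pandas as pd
from numerapi import NumerAPI
from utils import validation_metrics

napi = NumerAPI()
napi.download_dataset("v5.0/validation.parquet")
napi.download_dataset("v5.0/features.json")

features = json.load(open("v5.0/features.json"))["feature_sets"]["small"]
cols = ["era", "data_type", "target"] + features
validation = pd.read_parquet("v5.0/validation.parquet", columns=cols)
validation = validation[validation["data_type"] == "validation"].dropna(subset=["target"])

# Stand-in predictions so the call is self-contained: one feature plays the "model",
# another plays the example/benchmark predictions. Replace both with real predictions.
validation["prediction"] = validation[features[0]]
validation["example_preds"] = validation[features[1]]

stats = validation_metrics(
    validation,
    pred_cols=["prediction"],
    example_col="example_preds",
    fast_mode=True,  # skip the slower TB200 / MMC / neutralization metrics
    target_col="target",
)
print(stats)  # one row per prediction column: mean, std, sharpe, max_drawdown, apy
```

With `fast_mode=False` the same call also reports max feature exposure, feature-neutral means, TB200 statistics, and an MMC-style score, which is where the remaining helpers in this module are used.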