├── .github └── workflows │ ├── python-package-cd.yml │ └── python-package-ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── examples └── basic_demo.ipynb ├── noxfile.py ├── pyproject.toml ├── src └── llm_regression │ ├── __init__.py │ ├── models.py │ ├── py.typed │ └── utils.py └── tests ├── test_models.py └── test_utils.py /.github/workflows/python-package-cd.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Python package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | build-and-deploy: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | python-version: ["3.12"] 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install Nox 21 | run: | 22 | python -m pip install --upgrade pip 23 | python -m pip install --upgrade setuptools 24 | python -m pip install "nox==2023.4.22" 25 | - name: Build wheel and push to PyPI 26 | env: 27 | PYPI_USR: ${{ secrets.PYPI_USR }} 28 | PYPI_PWD: ${{ secrets.PYPI_PWD }} 29 | run: | 30 | nox -s build_and_deploy 31 | -------------------------------------------------------------------------------- /.github/workflows/python-package-ci.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | python-version: ["3.12"] 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install Nox 21 | run: | 22 | python -m pip install --upgrade pip 23 | python -m pip install --upgrade setuptools 24 | python -m pip install "nox==2023.4.22" 
25 | - name: Run code formatting checks 26 | run: | 27 | nox -s check_code_formatting-${{ matrix.python-version }} 28 | - name: Run static type checking 29 | run: | 30 | nox -s check_types-${{ matrix.python-version }} 31 | - name: Run tests 32 | run: | 33 | nox -s run_tests-${{ matrix.python-version }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Misc 2 | TODO.md 3 | .vscode/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | .ruff_cache 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | #.idea/ 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Alex Ioannides 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regression using LLMs 2 | 3 | The llm-regression package demonstrates how LLMs can be used to solve classical regression problems, and exposes these capabilities for you to experiment with. 
Example: 4 | 5 | ```python 6 | from llm_regression import OpenAiRegressor 7 | 8 | llm_regressor = OpenAiRegressor(model="gpt-3.5-turbo") 9 | llm_regressor.fit(X_train, y_train) 10 | y_pred = llm_regressor.predict(X_test) 11 | ``` 12 | 13 | This work was motivated by the paper, 14 | 15 | ["_From Words to Numbers: Your LLM is Secretly a Capable Regressor_", by Vacareanu et al. (2024)](https://arxiv.org/abs/2404.07544). 16 | 17 | Which is well worth a read! 18 | 19 | ## Installing 20 | 21 | You can install the llm_regression package, together with the dependencies required to run the example notebooks, directly from this repo, 22 | 23 | ```text 24 | pip install -U pip 25 | pip install "llm-regression[examples] @ git+https://github.com/AlexIoannides/llm-regression.git" 26 | ``` 27 | 28 | ## Examples 29 | 30 | Check out the [basic_demo notebook](https://github.com/AlexIoannides/llm-regression/tree/main/examples/basic_demo.ipynb). 31 | 32 | ## Developer Setup 33 | 34 | If you want to modify or extend the work in this repo, then the information in this section is for you. 35 | 36 | ### Install Developer Tools 37 | 38 | Install the package as an [editable dependency](https://setuptools.pypa.io/en/latest/userguide/development_mode.html), together with all the developer tools required to format code, check types and run tests: 39 | 40 | ```text 41 | pip install -e ".[dev]" 42 | ``` 43 | 44 | ### Developer Task Execution with Nox 45 | 46 | We use [Nox](https://nox.thea.codes/en/stable/) for scripting developer tasks, such as formatting code, checking types and running tests. These tasks are defined in `noxfile.py`, a list of which can be returned on the command line, 47 | 48 | ```text 49 | $ nox --list 50 | 51 | Sessions defined in /Users/.../noxfile.py: 52 | 53 | * run_tests-3.12 -> Run unit tests. 54 | - format_code-3.12 -> Lint code and re-format where necessary. 55 | * check_code_formatting-3.12 -> Check code for formatting errors. 
56 | * check_types-3.12 -> Run static type checking. 57 | - build_and_deploy-3.12 -> Build wheel and deploy to PyPI. 58 | 59 | sessions marked with * are selected, sessions marked with - are skipped. 60 | ``` 61 | 62 | Single tasks can be executed easily - e.g., 63 | 64 | ```text 65 | $ nox -s run_tests 66 | 67 | nox > Running session run_tests-3.12 68 | nox > Creating virtual environment (virtualenv) using python3.12 in .nox/run_tests-3-10 69 | nox > python -m pip install '.[dev]' 70 | nox > pytest 71 | ======================================== test session starts ======================================== 72 | platform darwin -- Python 3.12.2, pytest-7.4.2, pluggy-1.3.0 73 | rootdir: /Users/.../llm_regression 74 | configfile: pyproject.toml 75 | testpaths: tests 76 | collected 1 item 77 | 78 | tests/test_hello_world.py [100%] 79 | 80 | ========================================== 1 passed in 0.00s ========================================= 81 | nox > Session run_tests-3.12 was successful. 82 | ``` 83 | 84 | ### CI/CD 85 | 86 | This repo comes configured to run two [GitHub Actions](https://docs.github.com/en/actions) workflows: 87 | 88 | - **Test Python Package (CI)**, defined in `.github/workflows/python-package-ci.yml` 89 | - **Deploy Python Package (CD)**, defined in `.github/workflows/python-package-cd.yml` 90 | 91 | The CI workflow has been configured to run whenever a pull request to the `main` branch is created. The CD workflow has been configured to run whenever a [release](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository) is created on GitHub. 
92 | -------------------------------------------------------------------------------- /examples/basic_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting LLMs to Perform Regression\n", 8 | "\n", 9 | "This notebook explores the claim that LLMs can perform regression when training data is included as in-context examples - i.e., when labelled feature data is included in the prompt sent to the model. The paper that motivated this work is,\n", 10 | "\n", 11 | "[\"_From Words to Numbers: Your LLM is Secretly a Capable Regressor_\", by Vacareanu et al. (2024)](https://arxiv.org/abs/2404.07544).\n", 12 | "\n", 13 | "We've focused exclusively on OpenAI LLMs (accessed via API), and wrapped all of the code into a single class, `OpenAiRegressor`. This implements the Scikit-Learn [BaseEstimator](https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html) interface - i.e., the class has `fit` and `predict` methods with the same signature as those found in Scikit-Learn models. You can find the source code in `src/llm_regression/models.py` and/or install the package and import it directly, as we've done below.\n", 14 | "\n", 15 | "## Configuring OpenAI API Keys\n", 16 | "\n", 17 | "If you want to try this code for yourself, then you'll need an OpenAI API key. 
This can be set as an environment variable - e.g.,\n", 18 | "\n", 19 | "```python\n", 20 | "import os\n", 21 | "\n", 22 | "os.environ[\"OPENAI_API_KEY\"] = \"your-open-api-key\"\n", 23 | "```\n", 24 | "\n", 25 | "Alternatively, you can add the environment variable to a `.env` file in your working directory and `OpenAiRegressor` will pick it up from there (make sure to keep this file out of source control),\n", 26 | "\n", 27 | "```\n", 28 | "touch .env\n", 29 | "echo \"OPENAI_API_KEY=your-open-api-key\" >> .env\n", 30 | "```\n", 31 | "\n", 32 | "## How do we get LLMs to Perform Regression?\n", 33 | "\n", 34 | "By sending them prompts that include labelled feature data and then asking the model to provide a missing output - e.g.,\n", 35 | "\n", 36 | "```text\n", 37 | "Your task is to provide your best estimate for ”Output”. Please provide that\n", 38 | "and only that, without any additional text.\n", 39 | "\n", 40 | "Feature 0: -0.8333688059494058\n", 41 | "Output: -0.6929185295193974\n", 42 | "\n", 43 | "Feature 0: -0.3356073681186462\n", 44 | "Output: -0.24922638778859663\n", 45 | "\n", 46 | "...\n", 47 | "\n", 48 | "Feature 0: 0.5336073681182466\n", 49 | "Output:\n", 50 | "```\n", 51 | "\n", 52 | "This prompt format is taken directly from Vacareanu et al.'s paper." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Imports & Configuration\n", 60 | "\n", 61 | "If you've cloned this repo you can install the required dependencies using,\n", 62 | "\n", 63 | "```text\n", 64 | "pip install \".[examples]\"\n", 65 | "```\n", 66 | "\n", 67 | "Alternatively, you can install everything you need directly from GitHub,\n", 68 | "\n", 69 | "```text\n", 70 | "pip install \"llm-regression[examples] @ git+https://github.com/AlexIoannides/llm-regression.git\"\n", 71 | "```" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 1, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "import pandas as pd\n", 81 | "import seaborn as sns\n", 82 | "from llm_regression import OpenAiRegressor, make_univariate_linear_test_data\n", 83 | "from sklearn.linear_model import LinearRegression\n", 84 | "from sklearn.metrics import mean_absolute_error, r2_score\n", 85 | "from sklearn.model_selection import train_test_split\n", 86 | "\n", 87 | "sns.set_theme()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Make Regression Data\n", 95 | "\n", 96 | "To keep this example simple we're going to generate univariate regression data by sampling from the following model,\n", 97 | "\n", 98 | "$\n", 99 | "\\tilde{y} = \\rho \\cdot \\tilde{x} + \\sqrt{1 - \\rho^2} \\cdot \\tilde{\\epsilon}\n", 100 | "$\n", 101 | "\n", 102 | "Where both $\\tilde{x}$ and $\\tilde{\\epsilon}$ are drawn from the standardised Normal distribution. This model has the property that $\\rho$ determines the correlation between $\\tilde{y}$ and $\\tilde{x}$, and that the variance of $\\tilde{y}$ will also be 1 (like that for $\\tilde{x}$).\n", 103 | "\n", 104 | "We will restrict ourselves to 500 samples so that we don't breach GPT-3.5-Turbo's context window of 16k tokens." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 2, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "image/png": "", 115 | "text/plain": [ 116 | "
" 117 | ] 118 | }, 119 | "metadata": {}, 120 | "output_type": "display_data" 121 | } 122 | ], 123 | "source": [ 124 | "n_samples = 500\n", 125 | "dataset = make_univariate_linear_test_data(n_samples, rho=0.9)\n", 126 | "train_data, test_data = train_test_split(dataset, test_size=0.05, random_state=42)\n", 127 | "\n", 128 | "_ = sns.lmplot(train_data, x=\"x\", y=\"y\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Establish Baseline Results with OLS Regression\n", 136 | "\n", 137 | "We need something 'sensible' to compare the LLM regressor to. We use Scikit-Learn's `LinearRegression` class - a reference implementation of Ordinary Least Squares (OLS) regression." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "mean_abs_error = 0.338\n", 150 | "r_squared = 0.819\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "ols_regressor = LinearRegression()\n", 156 | "ols_regressor.fit(train_data[[\"x\"]], train_data[[\"y\"]])\n", 157 | "y_pred_ols = ols_regressor.predict(test_data[[\"x\"]])\n", 158 | "\n", 159 | "ols_results = (\n", 160 | " test_data.copy()\n", 161 | " .reset_index(drop=True)\n", 162 | " .assign(y_pred=y_pred_ols)\n", 163 | ")\n", 164 | "\n", 165 | "mean_abs_err_ols = mean_absolute_error(ols_results[\"y\"], ols_results[\"y_pred\"])\n", 166 | "r_squared_ols = r2_score(ols_results[\"y\"], ols_results[\"y_pred\"])\n", 167 | "\n", 168 | "print(f\"mean_abs_error = {mean_abs_err_ols:.3f}\")\n", 169 | "print(f\"r_squared = {r_squared_ols:.3f}\")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Results using GPT-3.5-Turbo\n", 177 | "\n", 178 | "A reassuringly expensive (and slow) regression..." 
179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 4, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [02:24<00:00, 5.77s/it]" 191 | ] 192 | }, 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "mean_abs_error = 0.34449656403978396\n", 198 | "r_squared = 0.7956923855240899\n" 199 | ] 200 | }, 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "gpt35_regressor = OpenAiRegressor(model=\"gpt-3.5-turbo\")\n", 211 | "gpt35_regressor.fit(train_data[[\"x\"]], train_data[[\"y\"]]) # -> add in-context examples to prompt\n", 212 | "y_pred_gpt35 = gpt35_regressor.predict(test_data[[\"x\"]])\n", 213 | "\n", 214 | "gpt35_results = (\n", 215 | " test_data.copy()\n", 216 | " .assign(y_pred=y_pred_gpt35)\n", 217 | " .reset_index(drop=True)\n", 218 | ")\n", 219 | "\n", 220 | "mean_abs_err_gpt35 = mean_absolute_error(gpt35_results[\"y\"], gpt35_results[\"y_pred\"])\n", 221 | "r_squared_gpt35 = r2_score(gpt35_results[\"y\"], gpt35_results[\"y_pred\"])\n", 222 | "\n", 223 | "print(f\"mean_abs_error = {mean_abs_err_gpt35}\")\n", 224 | "print(f\"r_squared = {r_squared_gpt35}\")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "Comparable with OLS regression!" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## Results using GPT-4o\n", 239 | "\n", 240 | "Like wheeling out a Ferrari to pop to the shops in heavy traffic..." 
241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 5, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [05:16<00:00, 12.68s/it]" 253 | ] 254 | }, 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "mean_abs_error = 0.34576103704931194\n", 260 | "r_squared = 0.8005441443484316\n" 261 | ] 262 | }, 263 | { 264 | "name": "stderr", 265 | "output_type": "stream", 266 | "text": [ 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "gpt4o_regressor = OpenAiRegressor(model=\"gpt-4o\")\n", 273 | "gpt4o_regressor.fit(train_data[[\"x\"]], train_data[[\"y\"]]) # -> add in-context examples to prompt\n", 274 | "y_pred_gpt4o = gpt4o_regressor.predict(test_data[[\"x\"]])\n", 275 | "\n", 276 | "gpt4o_results = (\n", 277 | " test_data.copy()\n", 278 | " .assign(y_pred=y_pred_gpt4o)\n", 279 | " .reset_index(drop=True)\n", 280 | ")\n", 281 | "\n", 282 | "mean_abs_err_gpt4o = mean_absolute_error(gpt4o_results[\"y\"], gpt4o_results[\"y_pred\"])\n", 283 | "r_squared_gpt4o = r2_score(gpt4o_results[\"y\"], gpt4o_results[\"y_pred\"])\n", 284 | "\n", 285 | "print(f\"mean_abs_error = {mean_abs_err_gpt4o}\")\n", 286 | "print(f\"r_squared = {r_squared_gpt4o}\")" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Even better, although I'm not sure it was worth the extra £ or time (or environmental impact...). If we were exploiting the larger context windown that comes with GPT-4o, then maybe we could justify using it." 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Comparing Results\n", 301 | "\n", 302 | "Let's take a look at 'predicted vs. 
actual' for OLS regression and GPT-4o (to make sure there's nothing odd going on)." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "image/png": "", 313 | "text/plain": [ 314 | "
" 315 | ] 316 | }, 317 | "metadata": {}, 318 | "output_type": "display_data" 319 | } 320 | ], 321 | "source": [ 322 | "combined_results = pd.concat(\n", 323 | " [ols_results.assign(model=\"OLS\"), gpt4o_results.assign(model=\"GPT-4o\")]\n", 324 | ")\n", 325 | "\n", 326 | "_ = sns.lmplot(combined_results, x=\"y\", y=\"y_pred\", hue=\"model\")" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "The two models produce comparable results, with GPT-4o 'a bit off' on a couple of predictions, which is probably what is driving the difference in performance metrics." 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3 (ipykernel)", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.12.2" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 4 358 | } 359 | -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Developer task automation.""" 2 | import os 3 | 4 | import nox 5 | 6 | nox.options.sessions = [ 7 | "check_code_formatting", 8 | "check_types", 9 | "run_tests", 10 | ] 11 | 12 | PYTHON = ["3.12"] 13 | 14 | 15 | @nox.session(python=PYTHON, reuse_venv=True) 16 | def run_tests(session: nox.Session): 17 | """Run unit tests.""" 18 | session.install(".[dev]") 19 | pytest_args = session.posargs if session.posargs else [] 20 | session.run("pytest", *pytest_args) 21 | 22 | 23 | @nox.session(python=PYTHON, reuse_venv=True) 24 | def format_code(session: nox.Session): 25 | """Lint code and re-format where necessary.""" 26 | session.install(".[dev]") 27 | 
@nox.session(python=PYTHON, reuse_venv=True)
def build_and_deploy(session: nox.Session):
    """Build wheel and deploy to PyPI.

    PyPI credentials are read from the PYPI_USR and PYPI_PWD environment variables.
    When python-dotenv is importable, a local .env file is loaded first so that the
    credentials can be kept there instead.
    """
    try:
        from dotenv import load_dotenv

        load_dotenv()
    except ModuleNotFoundError:
        session.warn("Expecting PYPI_USR and PYPI_PWD in local environment variables.")

    try:
        PYPI_USR = os.environ["PYPI_USR"]
        PYPI_PWD = os.environ["PYPI_PWD"]
    except KeyError as e:
        # session.error aborts the session, so the names above are always bound below.
        session.error(f"{str(e)} not found in local environment variables.")
    session.install(".[deploy]")
    # `rm` is not installed in the session's virtualenv, so flag it as external.
    session.run("rm", "-rf", "dist", external=True)
    session.run("python", "-m", "build")
    # Hand the credentials to twine through its environment variables rather than
    # -u/-p: Nox echoes each command line it runs, which would log the password.
    session.run(
        "twine",
        "upload",
        "dist/*",
        env={"TWINE_USERNAME": PYPI_USR, "TWINE_PASSWORD": PYPI_PWD},
    )
5 | readme = "README.md" 6 | authors = [ 7 | { name="Alex Ioannides", email="alex@bodywork.com" }, 8 | ] 9 | dependencies = [ 10 | "openai==1.30.*", 11 | "numpy==1.26.*", 12 | "pandas==2.2.*", 13 | "python-dotenv==1.0.*", 14 | "scikit-learn==1.4.*", 15 | "tqdm==4.66.*", 16 | ] 17 | 18 | [project.optional-dependencies] 19 | examples = [ 20 | "pandas==2.2.2", 21 | "scikit-learn==1.4.2", 22 | "seaborn==0.13.2", 23 | ] 24 | dev = [ 25 | "black==23.9.1", 26 | "python-dotenv>=1.0.0", 27 | "icecream", 28 | "ipython", 29 | "mypy==1.5.1", 30 | "nox==2023.4.22", 31 | "pandas-stubs==2.2.2.240514", 32 | "pytest==7.4.2", 33 | "ruff==0.0.290", 34 | "types-pytz==2024.1.0.20240417", 35 | "types-tqdm==4.66.0.20240417", 36 | ] 37 | deploy = [ 38 | "build>=1.0.0", 39 | "pip>=23.2.0", 40 | "setuptools>=68.0.0", 41 | "twine>=4.0.0", 42 | "wheel>=0.41.0", 43 | ] 44 | 45 | [project.urls] 46 | "Homepage" = "https://github.com/AlexIoannides/llm-regression" 47 | "Bug Tracker" = "https://github.com/AlexIoannides/llm-regression/issues" 48 | 49 | [build-system] 50 | requires = ["setuptools>=68.0"] 51 | build-backend = "setuptools.build_meta" 52 | 53 | [tool.setuptools] 54 | include-package-data = true 55 | 56 | [tool.setuptools.packages.find] 57 | where = ["src"] 58 | 59 | [tool.black] 60 | line-length = 88 61 | 62 | [tool.ruff] 63 | src = ["src"] 64 | target-version = "py310" 65 | line-length = 88 66 | select = [ 67 | "D", # pydocstyle 68 | "E", # pycodestyle errors 69 | "F", # pyflakes 70 | "I", # isort 71 | "UP", # pyupgrade 72 | "W", # pycodestyle warnings 73 | ] 74 | ignore = [ 75 | "D203", # fix pydocstyle warning 76 | "D213", # fix pydocstyle warning 77 | ] 78 | 79 | [tool.ruff.per-file-ignores] 80 | "tests/*" = [ 81 | "D103", 82 | ] 83 | 84 | [tool.pytest.ini_options] 85 | testpaths = ["tests"] 86 | 87 | [tool.mypy] 88 | python_version = "3.12" 89 | files = [ 90 | "src", 91 | "tests", 92 | "noxfile.py", 93 | ] 94 | 95 | [[tool.mypy.overrides]] 96 | module = [ 97 | "sklearn.*", 98 | ] 
class OpenAiRegressor:
    """Generic regression using Open AI LLMs.

    No model weights are trained: `fit` renders the labelled training data as text
    and `predict` sends that text, followed by one unlabelled row, to the chat
    completions API and parses a number out of the reply.
    """

    # Matches the first decimal number in a reply: optional sign, digits with an
    # optional fraction (or a bare fraction such as ".5"), optional exponent.
    _NUMBER_PATTERN = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

    def __init__(self, model: OpenAiModel = "gpt-3.5-turbo", seed: int = 42):
        """Initialise object.

        Args:
        ----
            model: Open AI model to use. Defaults to "gpt-3.5-turbo".
            seed: Random seed to use with OpenAI model. Defaults to 42.
        """
        load_dotenv()  # load OPENAI_API_KEY from .env file (if present)
        self._client = OpenAI()
        self._model = model
        self._model_seed = seed
        self._prompt_instruction = (
            "Your task is to provide your best estimate for ”Output”. Please provide "
            "that and only that, without any additional text."
        )
        # Empty until `fit` is called - `predict` uses this to detect an unfitted model.
        self._prompt_train_data: str = ""

    def __repr__(self) -> str:
        """Create string representation."""
        return f"OpenAiRegressor(model={self._model})"

    def fit(self, X: DataFrame | ndarray, y: DataFrame | ndarray) -> OpenAiRegressor:
        """Create a prompt based on training data to use when predicting with an LLM.

        Args:
        ----
            X: Feature data, one row per sample.
            y: Labels, one row per sample. Only the first column is used.

        Raises:
        ------
            ValueError: If the dimensions of X or y are invalid and/or inconsistent with
                one another.

        Returns:
        -------
            The OpenAiRegressor object.
        """
        if X.ndim < 2:
            raise ValueError("X.ndim must be >= 2")
        if y.ndim < 2:
            raise ValueError("y.ndim must be == 2")
        if len(X) != len(y):
            raise ValueError("len(y) != len(X)")

        # Convert to nested lists of Python scalars so values render plainly in text.
        _X = X.tolist() if isinstance(X, ndarray) else X.values.tolist()
        _y = y.tolist() if isinstance(y, ndarray) else y.values.tolist()

        self._prompt_train_data = "\n\n".join(
            self._format_data_row(x_row, y_row) for x_row, y_row in zip(_X, _y)
        )

        return self

    def predict(self, X: DataFrame | ndarray, logging: bool = True) -> ndarray:
        """Predict labels using model and feature data.

        Any prediction failures will return `numpy.nan` - prediction won't be halted,
        given the expense of querying LLMs.

        Args:
        ----
            X: Feature data to use for predictions.
            logging: Enable logging. Default to True.

        Raises:
        ------
            RuntimeError: If `.fit` has not been called.

        Returns:
        -------
            Model predictions as a column vector with one row per row of X.
        """
        if not self._prompt_train_data:
            raise RuntimeError("please fit model before trying to generate predictions")

        _X = X if isinstance(X, ndarray) else X.values
        y_pred: list[float] = []

        for n, row in tqdm(enumerate(_X), total=len(_X)):
            # Deliberately broad: one failed request or unparseable reply must not
            # discard the predictions already paid for.
            try:
                prediction_prompt = self._compose_prediction_prompt(
                    self._prompt_instruction,
                    self._prompt_train_data,
                    self._format_data_row(row),
                )
                llm_response = self._client.chat.completions.create(
                    model=self._model,
                    messages=[{"role": "user", "content": prediction_prompt}],
                    temperature=0,
                    response_format={"type": "text"},
                    seed=self._model_seed,
                )
                llm_generation = llm_response.choices[0].message.content
                if llm_generation:
                    y_pred.append(self._parse_model_output(llm_generation))
                else:
                    y_pred.append(np.nan)
            except Exception as e:
                if logging:
                    log.warning(f"LLM error for test data row #{n} - {str(e)}")
                y_pred.append(np.nan)

        return np.array(y_pred).reshape(-1, 1)

    @staticmethod
    def _compose_prediction_prompt(
        instruction: str, train_data: str, test_data: str
    ) -> str:
        """Compose full prompt from constituent parts."""
        return instruction + "\n" + train_data + "\n\n" + test_data

    @staticmethod
    def _format_data_row(x_row: ndarray, y_row: ndarray | None = None) -> str:
        """Format a data row for inclusion in model prompt.

        The label is left blank when `y_row` is None or empty, which is how the row
        to be predicted is presented to the model.
        """
        # Test against None and length rather than truthiness: a one-element array
        # holding 0.0 is falsy and longer arrays raise when evaluated as a bool.
        has_label = y_row is not None and len(y_row) > 0
        output = y_row[0] if has_label else ""
        prompt_data = "\n".join(
            [f"Feature {n}: {x}" for n, x in enumerate(x_row)] + [f"Output: {output}"]
        )
        return prompt_data

    @staticmethod
    def _parse_model_output(output: str) -> float:
        """Parse the first number found in the model's output.

        Raises IndexError when the output contains no number.
        """
        result = OpenAiRegressor._NUMBER_PATTERN.findall(output)[0]
        return float(result)
-------------------------------------------------------------------------------- /src/llm_regression/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/llm-regression/631c8ff3ba4e0675a8c40a770223b0cd9b04aa1b/src/llm_regression/py.typed -------------------------------------------------------------------------------- /src/llm_regression/utils.py: -------------------------------------------------------------------------------- 1 | """Helpful functions.""" 2 | from numpy import sqrt 3 | from numpy.random import default_rng 4 | from pandas import DataFrame 5 | 6 | 7 | def make_univariate_linear_test_data( 8 | n_samples: int = 1000, *, rho: float = 0.75, seed: int = 42 9 | ) -> DataFrame: 10 | """Simulate a y = rho * x + sqrt(1 - rho ** 2) * epsilon. 11 | 12 | This paradign ensures that the standard deviation of x and y is always 1, and that 13 | x has correlation with y given by rho. 14 | 15 | Args: 16 | ---- 17 | n_samples: Number of samples to generate. Defaults to 1000. 18 | rho: Rho coeffcient (correlation coefficient). Defaults to 0.75. 19 | seed: Random seed. Defaults to 42. 20 | 21 | Returns: 22 | ------- 23 | Dataframe of test data. 
24 | """ 25 | if not (rho >= 0 and rho <= 1): 26 | raise ValueError(f"rho = {rho} - must in [0, 1]") 27 | rng = default_rng(seed) 28 | x = rng.standard_normal(n_samples) 29 | epsilon = rng.standard_normal(n_samples) 30 | y = rho * x + sqrt(1 - rho * rho) * epsilon 31 | return DataFrame({"x": x, "y": y}) 32 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | """Tests for LLM regression modelling.""" 2 | from re import escape 3 | from unittest.mock import DEFAULT, Mock, patch 4 | 5 | from numpy import array, nan 6 | from numpy.testing import assert_array_equal 7 | from pandas import DataFrame 8 | from pytest import LogCaptureFixture, raises 9 | 10 | from llm_regression import OpenAiRegressor 11 | 12 | 13 | def test_OpeanAiRegressor__repr__(): 14 | with patch.multiple("llm_regression.models", load_dotenv=Mock, OpenAI=Mock): 15 | model = OpenAiRegressor() 16 | assert repr(model) == "OpenAiRegressor(model=gpt-3.5-turbo)" 17 | 18 | 19 | def test_OpeanAiRegressor_fit_makes_prompt_train_data_pandas_dataframe(): 20 | with patch.multiple("llm_regression.models", load_dotenv=Mock, OpenAI=Mock): 21 | train_data = DataFrame({"x0": [1.0, -0.1], "x1": [0.1, -1.0], "y": [1.0, 2.0]}) 22 | X = train_data[["x0", "x1"]] 23 | y = train_data[["y"]] 24 | 25 | model = OpenAiRegressor() 26 | 27 | expected_prompt = ( 28 | "Feature 0: 1.0\nFeature 1: 0.1\nOutput: 1.0\n\n" 29 | "Feature 0: -0.1\nFeature 1: -1.0\nOutput: 2.0" 30 | ) 31 | 32 | assert model.fit(X, y)._prompt_train_data == expected_prompt 33 | 34 | 35 | def test_OpeanAiRegressor_fit_makes_prompt_train_data_numpy_array(): 36 | with patch.multiple("llm_regression.models", load_dotenv=Mock, OpenAI=Mock): 37 | X = array([[1.0, 0.1], [-0.1, -1.0]]) 38 | y = array([[1.0], [2.0]]) 39 | model = OpenAiRegressor() 40 | 41 | expected_prompt = ( 42 | "Feature 0: 1.0\nFeature 1: 0.1\nOutput: 1.0\n\n" 43 | 
"Feature 0: -0.1\nFeature 1: -1.0\nOutput: 2.0" 44 | ) 45 | 46 | assert model.fit(X, y)._prompt_train_data == expected_prompt 47 | 48 | 49 | def test_OpeanAiRegressor_fit_raises_errors_on_inconsistent_inputs(): 50 | with patch.multiple("llm_regression.models", load_dotenv=Mock, OpenAI=Mock): 51 | model = OpenAiRegressor() 52 | 53 | with raises(ValueError, match=escape("X.ndim must be >= 2")): 54 | model.fit(array([1.0, 0.1, -0.1, -1.0]), array([[1.0], [2.0]])) 55 | 56 | with raises(ValueError, match=escape("y.ndim must be == 2")): 57 | model.fit(array([[1.0, 0.1], [-0.1, -1.0]]), array([1.0, 2.0])) 58 | 59 | with raises(ValueError, match=escape("len(y) != len(X)")): 60 | model.fit(array([[1.0, 0.1], [-0.1, -1.0]]), array([[1.0]])) 61 | 62 | 63 | def test_OpenAiRegressor_predict_returns_predictions(): 64 | def make_mock_api_response(content: str | None) -> Mock: 65 | mock_response = Mock() 66 | mock_response.choices = [Mock()] 67 | mock_response.choices[0].message.content = content 68 | return mock_response 69 | 70 | with patch.multiple( 71 | "llm_regression.models", load_dotenv=DEFAULT, OpenAI=DEFAULT 72 | ) as mock_objs: 73 | mock_client = mock_objs["OpenAI"].return_value 74 | mock_client.chat.completions.create.side_effect = [ 75 | make_mock_api_response("Output: 1.0"), 76 | make_mock_api_response("Output: -1.0"), 77 | make_mock_api_response(None), 78 | ] 79 | model = OpenAiRegressor() 80 | model._prompt_train_data = "Predict some stuff." 81 | y_pred = model.predict(array([[1.0], [0.1], [0.0]])) 82 | assert_array_equal(y_pred, array([[1.0], [-1.0], [nan]])) 83 | 84 | 85 | def test_OpenAiRegressor_predict_handles_response_errors(): 86 | with patch.multiple( 87 | "llm_regression.models", load_dotenv=DEFAULT, OpenAI=DEFAULT 88 | ) as mock_objs: 89 | mock_client = mock_objs["OpenAI"].return_value 90 | mock_client.chat.completions.create.side_effect = [Exception, Exception] 91 | model = OpenAiRegressor() 92 | model._prompt_train_data = "Predict some stuff." 
93 | y_pred = model.predict(array([[1.0], [0.1]])) 94 | assert_array_equal(y_pred, array([[nan], [nan]])) 95 | 96 | 97 | def test_OpenAiRegressor_predict_logs_errors(caplog: LogCaptureFixture): 98 | with patch.multiple( 99 | "llm_regression.models", load_dotenv=DEFAULT, OpenAI=DEFAULT 100 | ) as mock_objs: 101 | mock_client = mock_objs["OpenAI"].return_value 102 | mock_client.chat.completions.create.side_effect = Exception("foo") 103 | model = OpenAiRegressor() 104 | model._prompt_train_data = "Predict some stuff." 105 | model.predict(array([[1.0]])) 106 | 107 | log_record_one = caplog.records[0] 108 | assert len(caplog.records) == 1 109 | assert log_record_one.levelname == "WARNING" 110 | assert log_record_one.message == "LLM error for test data row #0 - foo" 111 | 112 | # make sure we can switch logging off 113 | model.predict(array([[1.0]]), logging=False) 114 | assert len(caplog.records) == 1 115 | 116 | 117 | def test_OpeanAiRegressor_compose_prediction_prompt(): 118 | assert OpenAiRegressor._compose_prediction_prompt("a", "b", "c") == "a\nb\n\nc" 119 | 120 | 121 | def test_OpeanAiRegressor_format_data_row(): 122 | assert OpenAiRegressor._format_data_row([1.0]) == "Feature 0: 1.0\nOutput: " 123 | assert ( 124 | OpenAiRegressor._format_data_row([1.0, -0.1]) 125 | == "Feature 0: 1.0\nFeature 1: -0.1\nOutput: " 126 | ) # noqa 127 | assert ( 128 | OpenAiRegressor._format_data_row([1.0], [0.1]) == "Feature 0: 1.0\nOutput: 0.1" 129 | ) # noqa 130 | 131 | 132 | def test_OpenAiRegressor_parse_model_output(): 133 | assert OpenAiRegressor._parse_model_output("\nOutput: 0.101") == 0.101 134 | assert OpenAiRegressor._parse_model_output("Output: -1.101") == -1.101 135 | assert OpenAiRegressor._parse_model_output("Output: 1") == 1.0 136 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for utlity code.""" 2 | from numpy import 
corrcoef 3 | from pytest import approx 4 | 5 | from llm_regression import make_univariate_linear_test_data 6 | 7 | 8 | def test_make_univariate_linear_test_data(): 9 | corr_coeff = 0.8 10 | dataset = make_univariate_linear_test_data(100000, rho=corr_coeff) 11 | assert "x" in dataset.columns 12 | assert "y" in dataset.columns 13 | assert dataset["x"].mean() == approx(0.0, abs=0.01) 14 | assert dataset["x"].std() == approx(1.0, abs=0.01) 15 | assert dataset["y"].mean() == approx(0.0, abs=0.01) 16 | assert dataset["y"].std() == approx(1.0, abs=0.01) 17 | assert corrcoef(dataset["x"], dataset["y"])[0][1] == approx(corr_coeff, abs=0.01) 18 | --------------------------------------------------------------------------------