├── .DS_Store ├── .env.template ├── .github └── workflows │ ├── run_bot_on_quarterly_cup.yaml │ ├── run_bot_on_tournament.yaml │ └── test_bot.yaml ├── .gitignore ├── README.md ├── community_benchmark.py ├── main.py ├── main_with_no_framework.py ├── poetry.lock └── pyproject.toml /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Metaculus/metac-bot-template/593f29f8cf65cbfda103c5791b4735a2dd35d9b6/.DS_Store -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | # Required 2 | METACULUS_TOKEN=1234567890 3 | 4 | # Optional 5 | PERPLEXITY_API_KEY=1234567890 6 | OPENAI_API_KEY=1234567890 7 | EXA_API_KEY=1234567890 8 | ASKNEWS_CLIENT_ID=1234567890 9 | ASKNEWS_SECRET=1234567890 10 | ANTHROPIC_API_KEY=1234567890 -------------------------------------------------------------------------------- /.github/workflows/run_bot_on_quarterly_cup.yaml: -------------------------------------------------------------------------------- 1 | name: Forecast on Quarterly Cup 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 */2 * *" # runs at midnight every 2 days 7 | 8 | # Add concurrency group to prevent parallel runs 9 | concurrency: 10 | group: ${{ github.workflow }} 11 | cancel-in-progress: false 12 | 13 | 14 | # Daily job to run the simple forecast bot 15 | jobs: 16 | daily_build: 17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 18 | steps: # sets up the steps that will be run in order 19 | # setup repository with all necessary dependencies - keep as is 20 | - name: Check out repository 21 | uses: actions/checkout@v3 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: "3.11" 25 | - name: Install poetry 26 | uses: snok/install-poetry@v1 27 | with: 28 | virtualenvs-create: true 29 | virtualenvs-in-project: true 30 | installer-parallel: true 31 | - name: Load cached venv 32 | id: cached-poetry-dependencies 33 | uses: actions/cache@v4 34 | with: 35 | path: .venv 36 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 37 | - name: Install dependencies 38 | run: poetry install --no-interaction --no-root 39 | - name: Run bot 40 | run: | 41 | poetry run python main.py --mode quarterly_cup 42 | # this reads the environment variables from the github repository. 43 | # Store under Settings --> Secrets and variables --> Actions 44 | env: 45 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 46 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 47 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 48 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 49 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 50 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 51 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 52 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.github/workflows/run_bot_on_tournament.yaml: -------------------------------------------------------------------------------- 1 | name: Forecast on new AI tournament questions 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "*/30 * * * *" # runs every 30 minutes. Make sure to skip already forecasted questions! 
7 | 8 | # Add concurrency group to prevent parallel runs 9 | concurrency: 10 | group: ${{ github.workflow }} 11 | cancel-in-progress: false 12 | 13 | 14 | # Daily job to run the simple forecast bot 15 | jobs: 16 | daily_build: 17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 18 | steps: # sets up the steps that will be run in order 19 | # setup repository with all necessary dependencies - keep as is 20 | - name: Check out repository 21 | uses: actions/checkout@v3 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: "3.11" 25 | - name: Install poetry 26 | uses: snok/install-poetry@v1 27 | with: 28 | virtualenvs-create: true 29 | virtualenvs-in-project: true 30 | installer-parallel: true 31 | - name: Load cached venv 32 | id: cached-poetry-dependencies 33 | uses: actions/cache@v4 34 | with: 35 | path: .venv 36 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 37 | - name: Install dependencies 38 | run: poetry install --no-interaction --no-root 39 | - name: Run bot 40 | run: | 41 | poetry run python main.py 42 | # this reads the environment variables from the github repository. 43 | # Store under Settings --> Secrets and variables --> Actions 44 | env: 45 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 46 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 47 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 48 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 49 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 50 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 51 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 52 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.github/workflows/test_bot.yaml: -------------------------------------------------------------------------------- 1 | name: Test Bot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | # Add concurrency group to prevent parallel runs 7 | concurrency: 8 | group: ${{ github.workflow }} 9 | cancel-in-progress: false 10 | 11 | 12 | # Daily job to run the simple forecast bot 13 | jobs: 14 | daily_build: 15 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 16 | steps: # sets up the steps that will be run in order 17 | # setup repository with all necessary dependencies - keep as is 18 | - name: Check out repository 19 | uses: actions/checkout@v3 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.11" 23 | - name: Install poetry 24 | uses: snok/install-poetry@v1 25 | with: 26 | virtualenvs-create: true 27 | virtualenvs-in-project: true 28 | installer-parallel: true 29 | - name: Load cached venv 30 | id: cached-poetry-dependencies 31 | uses: actions/cache@v4 32 | with: 33 | path: .venv 34 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 35 | - name: Install dependencies 36 | run: poetry install --no-interaction --no-root 37 | - name: Run bot 38 | run: | 39 | poetry run python main.py --mode test_questions 40 | # this reads the environment variables from the github repository. 
41 | # Store under Settings --> Secrets and variables --> Actions 42 | env: 43 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 44 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 45 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 46 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 47 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 48 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 49 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 50 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Specific to Project ### 2 | benchmarks/ 3 | sandbox.py 4 | 5 | 6 | ### General gitignore ### 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | # *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 
116 | # https://pdm.fming.dev/#use-with-ide
117 | .pdm.toml
118 | 
119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
120 | __pypackages__/
121 | 
122 | # Celery stuff
123 | celerybeat-schedule
124 | celerybeat.pid
125 | 
126 | # SageMath parsed files
127 | *.sage.py
128 | 
129 | # Environments
130 | .env
131 | .venv
132 | env/
133 | venv/
134 | ENV/
135 | env.bak/
136 | venv.bak/
137 | 
138 | # Spyder project settings
139 | .spyderproject
140 | .spyproject
141 | 
142 | # Rope project settings
143 | .ropeproject
144 | 
145 | # mkdocs documentation
146 | /site
147 | 
148 | # mypy
149 | .mypy_cache/
150 | .dmypy.json
151 | dmypy.json
152 | 
153 | # Pyre type checker
154 | .pyre/
155 | 
156 | # pytype static type analyzer
157 | .pytype/
158 | 
159 | # Cython debug symbols
160 | cython_debug/
161 | 
162 | # Ruff
163 | .ruff_cache/
164 | 
165 | # PyCharm
166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | # and can be added to the global gitignore or merged into this file. For a more nuclear
169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | # .idea/
171 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Metaculus forecasting bot
2 | This repository contains a simple bot meant to get you started with creating your own bot for the AI Forecasting Tournament. Go to https://www.metaculus.com/aib/ for more info and tournament rules.
3 | 
4 | There are two main files in this project:
5 | - **main.py**: Our recommended template option. It uses the [forecasting-tools](https://github.com/Metaculus/forecasting-tools) package to handle a lot of work in the background for you (such as API calls). We will keep updating the package, so you gain new features with minimal changes to your code.
6 | - **main_with_no_framework.py**: A copy of main.py implemented with minimal dependencies. Useful if you want a more custom approach.
7 | 
8 | Join the conversation about bot creation, get support, and follow updates in the 'build a forecasting bot' channel on the [Metaculus Discord](https://discord.com/invite/NJgCC2nDfh).
9 | 
10 | ## 30min Video Tutorial
11 | This tutorial shows you how to set up our template bot so you can start forecasting in the tournament.
12 | 
13 | [![Watch the tutorial](https://cdn.loom.com/sessions/thumbnails/fc3c1a643b984a15b510647d8f760685-42b452e1ab7d2afa-full-play.gif)](https://www.loom.com/share/fc3c1a643b984a15b510647d8f760685?sid=29b502e0-cf64-421e-82c0-3a78451159ed)
14 | 
15 | If you run into trouble, reach out to `ben [at] metaculus [.com]`
16 | 
17 | 
18 | ## Quick start -> Fork and use Github Actions
19 | The easiest way to use this repo is to fork it, enable GitHub workflows/actions, and then set repository secrets. Your bot will then run every 30 minutes, pick up new questions, and forecast on them. Automation is handled in the `.github/workflows/` folder. The `run_bot_on_tournament.yaml` workflow runs the bot every 30 minutes and skips questions it has already forecasted on.
20 | 
21 | 1) **Fork the repository**: Go to the [repository](https://github.com/Metaculus/metac-bot-template) and click 'fork'.
22 | 2) **Set secrets**: Go to `Settings -> Secrets and variables -> Actions -> New repository secret` and set API keys/tokens as secrets.
You will want to set your METACULUS_TOKEN. This is used to post your forecasts to Metaculus, and to use our OpenAI/Anthropic LLM proxy (reach out to `ben [at] metaculus [.com]` with your bot description to apply for credits; see the relevant section below).
23 | 3) **Enable Actions**: Go to 'Actions' then click 'Enable'. Then go to the 'Regularly forecast new questions' workflow and click 'Enable'. To test if the workflow is working, click 'Run workflow', choose the main branch, then click the green 'Run workflow' button. This will check for new questions and forecast only on ones it has not yet successfully forecast on.
24 | 
25 | The bot should work as-is at this point. You can disable the workflow by clicking `Actions > Regularly forecast new questions > Triple dots > disable workflow`.
26 | 
27 | ## Getting your Metaculus Token
28 | To get a bot account and your API token:
29 | 1) Go to https://metaculus.com/aib
30 | 2) Click "Log Out" if you are using your personal account
31 | 3) Click "Create a Bot Account"
32 | 4) Create your account
33 | 5) Go back to https://metaculus.com/aib
34 | 6) Click 'Show My Token'
35 | 
36 | If your regular Metaculus account uses Gmail, you can create a separate bot account while keeping your existing email by adding a '+bot' before the @ symbol. For example, if your email is 'youremail@gmail.com', you can use 'youremail+bot1@gmail.com' for your bot account.
37 | 
38 | ## Search Provider API Keys
39 | 
40 | ### Getting AskNews Set Up
41 | Metaculus is collaborating with AskNews to give bot builders free access to news searches. Each registered bot builder gets 3k calls per month and 9k calls total for the tournament (note that latest-news requests, which look 48 hours back, cost 1 call, while archive news requests cost 5 calls), plus 5M tokens. Bots have access to the /news and /deepnews endpoints. To sign up:
42 | 1. Make an account on AskNews if you have not already (https://my.asknews.app)
43 | 2. Join the [AskNews discord](https://discord.gg/99qt5HGgUn) and send your bot name + AskNews registered email to the #api-support channel.
44 | 3. AskNews will make sure your account has free calls and is ready for you to create API keys and get going
45 | 4. Generate your `ASKNEWS_CLIENT_ID` and `ASKNEWS_SECRET` [here](https://my.asknews.app/en/settings/api-credentials) and add them to your .env
46 | 5. Run the `AskNewsSearcher` from the forecasting-tools package or use the AskNews SDK Python package (examples of both are below)
47 | 
48 | Your account will be active for the duration of the tournament. There is only one account allowed per participant.
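A minimal sketch of the forecasting-tools route (step 5 above) follows. It mirrors how `main.py` in this repo calls the searcher and assumes `ASKNEWS_CLIENT_ID` and `ASKNEWS_SECRET` are set in your environment; the question string is just a placeholder:

```python
import asyncio

from forecasting_tools import AskNewsSearcher


async def get_news_context(question_text: str) -> str:
    # AskNewsSearcher reads ASKNEWS_CLIENT_ID / ASKNEWS_SECRET from the environment
    return await AskNewsSearcher().get_formatted_news_async(question_text)


if __name__ == "__main__":
    print(asyncio.run(get_news_context("Will humans land on Mars before 2035?")))
```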
49 | 
50 | Example usage of /news and /deepnews:
51 | 
52 | ```python
53 | from asknews_sdk import AsyncAskNewsSDK
54 | import asyncio
55 | 
56 | """
57 | More information available here:
58 | https://docs.asknews.app/en/news
59 | https://docs.asknews.app/en/deepnews
60 | 
61 | Installation:
62 | pip install asknews
63 | """
64 | 
65 | client_id = ""
66 | client_secret = ""
67 | 
68 | ask = AsyncAskNewsSDK(
69 |     client_id=client_id,
70 |     client_secret=client_secret,
71 |     scopes=["chat", "news", "stories", "analytics"],
72 | )
73 | 
74 | # /news endpoint example
75 | async def search_news(query):
76 | 
77 |     hot_response = await ask.news.search_news(
78 |         query=query,  # your natural language query
79 |         n_articles=5,  # control the number of articles to include in the context
80 |         return_type="both",
81 |         strategy="latest news"  # enforces looking at the latest news only
82 |     )
83 | 
84 |     print(hot_response.as_string)
85 | 
86 |     # get context from the "historical" database that contains a news archive going back to 2023
87 |     historical_response = await ask.news.search_news(
88 |         query=query,
89 |         n_articles=10,
90 |         return_type="both",
91 |         strategy="news knowledge"  # looks for relevant news within the past 60 days
92 |     )
93 | 
94 |     print(historical_response.as_string)
95 | 
96 | # /deepnews endpoint example:
97 | async def deep_research(
98 |     query, sources, model, search_depth=2, max_depth=2
99 | ):
100 | 
101 |     response = await ask.chat.get_deep_news(
102 |         messages=[{"role": "user", "content": query}],
103 |         search_depth=search_depth,
104 |         max_depth=max_depth,
105 |         sources=sources,
106 |         stream=False,
107 |         return_sources=False,
108 |         model=model,
109 |         inline_citations="numbered"
110 |     )
111 | 
112 |     print(response)
113 | 
114 | 
115 | if __name__ == "__main__":
116 |     query = "What is the TAM of the global market for electric vehicles in 2025? With your final report, please report the TAM in USD using the tags ... "
117 | 
118 |     sources = ["asknews"]
119 |     model = "deepseek-basic"
120 |     search_depth = 2
121 |     max_depth = 2
122 |     asyncio.run(
123 |         deep_research(
124 |             query, sources, model, search_depth, max_depth
125 |         )
126 |     )
127 | 
128 |     asyncio.run(search_news(query))
129 | ```
130 | 
131 | Some tips for DeepNews:
132 | 
133 | You will get XML-style tags in your response, marking sections such as the model's thinking, its searches, and its final answer.
134 | 
139 | These tags are likely useful for extracting the pieces that you need for your pipeline. For example, if you don't want to include all the thinking/searching, you can extract just the final-answer section.
140 | 
141 | ### Getting Perplexity Set Up
142 | Perplexity works as an internet-powered LLM and costs half a cent per search (if you pick the right model) plus token costs. It is less customizable but generally cheaper.
143 | 1. Create an account on the free tier at www.perplexity.ai
144 | 2. Go to https://www.perplexity.ai/settings/account
145 | 3. Click "API" in the top bar
146 | 4. Click "Generate" in the "API Keys" section
147 | 5. Add funds to your account with the 'Buy Credits' button
148 | 6. Add it to the .env as `PERPLEXITY_API_KEY=your-key-here`
149 | 
150 | ### Getting Exa Set Up
151 | Exa is closer to a traditional search provider: it takes in a search query and a list of filters and returns a list of websites. Each site returned can have scraped text, semantic highlights, an AI summary, and more. By putting GPT on top of Exa, you can recreate Perplexity with more control. An implementation of this is available in the `SmartSearcher` of the `forecasting-tools` Python package.
Each Exa search costs half a cent, plus a tenth of a cent per 'text-content' item requested per site. Content items include highlights from a source, a summary of a source, or the full text.
152 | 1. Make an account with Exa at Exa.ai
153 | 2. Go to https://dashboard.exa.ai/playground
154 | 3. Click on "API Keys" in the left sidebar
155 | 4. Create a new key
156 | 5. Go to 'Billing' in the left sidebar and add funds to your account with the 'Top Up Balance' button
157 | 6. Add it to the .env as `EXA_API_KEY=your-key-here`
158 | 
159 | ### Other Search
160 | Here are some other unvetted but interesting options for search and website reading:
161 | - Tavily
162 | - Google Search API
163 | - crawl4ai
164 | - Firecrawl
165 | - Playwright
166 | 
167 | ## Accessing the Metaculus LLM Proxy
168 | OpenAI and Anthropic have generously donated credits to bot builders in the tournament, which we are providing through an LLM proxy.
169 | 
170 | To get credits assigned to your model choices (or if you need renewed credits from a previous quarter), please send an email to `ben [at] metaculus [.com]` with the following:
171 | * The username of your bot
172 | * A couple-paragraph description of how your existing bot works, or what you plan to build
173 | * An estimate of how much budget/tokens you might productively use
174 | * Your preferred Anthropic/OpenAI model(s) and how you want the budget distributed between them (budget is allocated to each individual model name rather than to your account as a whole)
175 | 
176 | Metaculus will add new OpenAI and Anthropic completion models to the proxy as they come out. If you want to use a new model, please send us an email with the model you desire and how much budget you want removed from one model and transferred to another. Alternatively, if you have a new idea that needs more support, pitch it to us, and we can grant additional credits. Reach out if you run out.
177 | 
178 | Visit [this page](https://www.notion.so/metaculus/OpenAI-and-Anthropic-credits-0e1f7bf8c8a248e4a38da8758cc04de4) for instructions on how to call the Metaculus proxy directly.
179 | 
180 | You can also use the `forecasting-tools` package to call the proxy. To do this, call `await forecasting_tools.GeneralLlm(model="metaculus/{openai_or_anthropic_model_name}").invoke(prompt)`. You will need `METACULUS_TOKEN` set in your .env file and to have already had credits assigned to your account and model choice. `GeneralLlm` is a wrapper around the litellm package, which provides one API for every major model and provider, and it can also be used with providers like Gemini, xAI, or OpenRouter. For more information about how to use GeneralLlm/litellm, see [forecasting-tools](https://github.com/Metaculus/forecasting-tools) and [litellm](https://github.com/BerriAI/litellm).
181 | 
182 | 
183 | ## Run the bot locally
184 | Clone the repository. Open your terminal and run the following command:
185 | ```bash
186 | git clone https://github.com/Metaculus/metac-bot-template.git
187 | ```
188 | 
189 | If you forked the repository first, replace the URL in the `git clone` command with the URL of your fork (go to your forked repository and copy the URL from the browser's address bar).
190 | 
191 | ### Installing dependencies
192 | Make sure you have Python and [poetry](https://python-poetry.org/docs/#installing-with-pipx) installed (poetry is a Python package manager).
193 | 
194 | If you don't have poetry installed, run the below (on Debian/Ubuntu):
195 | ```bash
196 | sudo apt update -y
197 | sudo apt install -y pipx
198 | pipx install poetry
199 | 
200 | # Optional
201 | poetry config virtualenvs.in-project true
202 | ```
203 | 
204 | 
205 | Inside the terminal, go to the directory you cloned the repository into and run the following command:
206 | ```bash
207 | poetry install
208 | ```
209 | to install all required dependencies.
210 | 
211 | ### Setting environment variables
212 | 
213 | Running the bot requires various environment variables. If you run the bot locally, the easiest way to set them is to create a file called `.env` in the root directory of the repository (copy the `.env.template`).
214 | 
215 | ### Running the bot
216 | 
217 | To test the simple bot, execute the following command in your terminal:
218 | ```bash
219 | poetry run python main.py --mode test_questions
220 | ```
221 | Make sure to set the environment variables as described above and to set the parameters in the code to your liking. In particular, to submit predictions, make sure the publishing flag is `True` (`publish_reports_to_metaculus` in main.py, where it defaults to `True`, or `SUBMIT_PREDICTION` in main_with_no_framework.py).
222 | 
223 | ## Early Benchmarking
224 | Provided in this project is an example of how to benchmark your bot's forecasts against the community prediction for questions on Metaculus. Running `community_benchmark.py` will run versions of your bot defined by you (e.g. with different LLMs or research paths) and score them on how close they are to the community prediction using expected baseline score (a proper score assuming the community prediction is the true probability). You will want to edit the file to choose which bot configurations you want to test and how many questions you want to test on. Any class inheriting from `forecasting_tools.ForecastBot` can be passed into the benchmarker. As of March 28, 2025, the benchmarker only works with binary questions.
225 | 
226 | To run a benchmark:
227 | `poetry run python community_benchmark.py --mode run`
228 | 
229 | To run a custom benchmark (e.g. remove background info from questions to test retrieval):
230 | `poetry run python community_benchmark.py --mode custom`
231 | 
232 | To view a UI showing your scores, statistical error bars, and your bot's reasoning:
233 | `poetry run streamlit run community_benchmark.py`
234 | 
235 | See more information in the benchmarking section of the [forecasting-tools repo](https://github.com/Metaculus/forecasting-tools?tab=readme-ov-file#benchmarking)
236 | 
237 | ## Ideas for bot improvements
238 | Below are some ideas for making a novel bot.
239 | - Finetuned LLM on Metaculus Data: Create an optimized prompt (using DSPy or a similar toolset) and/or a fine-tuned LLM using all past Metaculus data. The thought is that this will train the LLM to be well-calibrated on real-life questions. Consider knowledge cutoffs and data leakage from search providers.
240 | - Dataset explorer: Create a tool that can find if there are datasets or graphs related to a question online, download them if they exist, and then run data science on them to answer a question.
241 | - Question decomposer: A tool that takes a complex question, breaks it down into simpler subquestions, and answers those instead.
242 | - Meta-Forecast Researcher: A tool that searches all major prediction markets, prediction aggregators, and possibly thought leaders to find relevant forecasts, and then combines them into an assessment for the current question (see [Metaforecast](https://metaforecast.org/)).
243 | - Base rate researcher: Create a tool to find accurate base rates. There is an experimental version [here](https://forecasting-tools.streamlit.app/base-rate-generator) in [forecasting-tools](https://github.com/Metaculus/forecasting-tools) that works 50% of the time.
244 | - Key factors researcher: Improve our experimental [key factors researcher](https://forecasting-tools.streamlit.app/key-factors) to find higher-significance key factors for a given question.
245 | - Monte Carlo Simulations: Experiment with combining some tools to run effective Monte Carlo simulations. This could include experimenting with combining Squiggle with the question decomposer.
246 | - Adding personality diversity, LLM diversity, and other variations: Have GPT come up with a number of different 'expert personalities' or 'world-models', run the forecasting bot with each of them, and then aggregate the median. Additionally, run the bot on different LLMs and see if taking the median across LLMs improves the forecast. Finally, try simulating up to hundreds of personality/LLM combinations to create large, diverse crowds. Each individual could have a backstory, a thinking process, biases it is resistant to, etc. This will ideally improve accuracy and give more useful bot reasoning outputs to help humans reading the output consider things from multiple angles.
247 | - Worldbuilding: Have GPT build out different future scenarios and then forecast all the different parts of those scenarios. It would then choose the most likely future world. In addition to a forecast, descriptions of future 'worlds' are created. This can take inspiration from Feynman paths.
248 | - Consistency Forecasting: Forecast many tangential questions all at once (in a single prompt) and prompt for consistency between the answers.
249 | - Extremize & Calibrate Predictions: Using the historical performance of a bot, adjust forecasts to be better calibrated. For instance, if predictions of 30% from the bot actually happen 40% of the time, then transform predictions of 30% to 40% (see the sketch after this list).
250 | - Assigning points to evidence: Starting with some ideas from a [blog post from Ozzie Gooen](https://forum.effectivealtruism.org/posts/mrAZFnEjsQAQPJvLh/using-points-to-rate-different-kinds-of-evidence), you could experiment with assigning 'points' to major types of evidence and having GPT categorize the evidence it finds related to a forecast so that the 'total points' can be calculated. This can then be turned into a forecast, and potentially optimized using machine learning on past Metaculus data.
251 | - Search provider benchmark: Run bots using different combinations of search providers (Google, Bing, Exa.ai, Tavily, AskNews, Perplexity, etc.) and search filters (e.g. only recent data, or sites with a certain search rank) and see if any specific one is better than others, or if using multiple of them makes a difference.
252 | - Timeline researcher: Make a tool that can take a niche topic and make a timeline for all major and minor events relevant to that topic.
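A minimal sketch of the calibration transform from the extremize-and-calibrate idea above, assuming you have already measured how often your bot's past forecasts resolved Yes at each probability level (the curve below is a made-up placeholder; in practice you would fit it from your bot's resolved-question history):

```python
import numpy as np


def recalibrate(p: float, calibration_curve: dict[float, float]) -> float:
    """Map a raw forecast to a calibrated one by linearly interpolating an
    empirical calibration curve of {raw probability: observed frequency}."""
    raw = np.array(sorted(calibration_curve))
    observed = np.array([calibration_curve[r] for r in raw])
    return float(np.interp(p, raw, observed))


# Hypothetical curve: e.g. this bot's past 30% forecasts resolved Yes 40% of the time
example_curve = {0.1: 0.12, 0.3: 0.40, 0.5: 0.55, 0.7: 0.72, 0.9: 0.88}
print(recalibrate(0.30, example_curve))  # -> 0.4
```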
253 | -------------------------------------------------------------------------------- /community_benchmark.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import asyncio 5 | import logging 6 | import sys 7 | from datetime import datetime, timedelta 8 | from typing import Literal 9 | 10 | import typeguard 11 | from forecasting_tools import ( 12 | Benchmarker, 13 | ForecastBot, 14 | GeneralLlm, 15 | MonetaryCostManager, 16 | MetaculusApi, 17 | ApiFilter, 18 | run_benchmark_streamlit_page, 19 | ) 20 | 21 | from main import TemplateForecaster 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | 27 | async def benchmark_forecast_bot(mode: str) -> None: 28 | """ 29 | Run a benchmark that compares your forecasts against the community prediction 30 | """ 31 | 32 | number_of_questions = 30 # Recommend 100+ for meaningful error bars, but 30 is faster/cheaper 33 | if mode == "display": 34 | run_benchmark_streamlit_page() 35 | return 36 | elif mode == "run": 37 | questions = MetaculusApi.get_benchmark_questions(number_of_questions) 38 | elif mode == "custom": 39 | # Below is an example of getting custom questions 40 | one_year_from_now = datetime.now() + timedelta(days=365) 41 | api_filter = ApiFilter( 42 | allowed_statuses=["open"], 43 | allowed_types=["binary"], 44 | num_forecasters_gte=40, 45 | scheduled_resolve_time_lt=one_year_from_now, 46 | includes_bots_in_aggregates=False, 47 | community_prediction_exists=True, 48 | ) 49 | questions = await MetaculusApi.get_questions_matching_filter( 50 | api_filter, 51 | num_questions=number_of_questions, 52 | randomly_sample=True, 53 | ) 54 | for question in questions: 55 | question.background_info = None # Test ability to find new information 56 | else: 57 | raise ValueError(f"Invalid mode: {mode}") 58 | 59 | with MonetaryCostManager() as cost_manager: 60 | bots = [ 61 | TemplateForecaster( 62 | predictions_per_research_report=5, 63 | llms={ 64 | "default": GeneralLlm( 65 | model="gpt-4o-mini", 66 | temperature=0.3, 67 | ), 68 | }, 69 | ), 70 | TemplateForecaster( 71 | predictions_per_research_report=1, 72 | llms={ 73 | "default": GeneralLlm( 74 | model="gpt-4o-mini", 75 | temperature=0.3, 76 | ), 77 | }, 78 | ), 79 | # Add other ForecastBots here (or same bot with different parameters) 80 | ] 81 | bots = typeguard.check_type(bots, list[ForecastBot]) 82 | benchmarks = await Benchmarker( 83 | questions_to_use=questions, 84 | forecast_bots=bots, 85 | file_path_to_save_reports="benchmarks/", 86 | concurrent_question_batch_size=10, 87 | ).run_benchmark() 88 | for i, benchmark in enumerate(benchmarks): 89 | logger.info( 90 | f"Benchmark {i+1} of {len(benchmarks)}: {benchmark.name}" 91 | ) 92 | logger.info( 93 | f"- Final Score: {benchmark.average_expected_baseline_score}" 94 | ) 95 | logger.info(f"- Total Cost: {benchmark.total_cost}") 96 | logger.info(f"- Time taken: {benchmark.time_taken_in_minutes}") 97 | logger.info(f"Total Cost: {cost_manager.current_usage}") 98 | 99 | 100 | if __name__ == "__main__": 101 | logging.basicConfig( 102 | level=logging.INFO, 103 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 104 | handlers=[ 105 | logging.StreamHandler(sys.stdout), 106 | logging.FileHandler(f"benchmarks/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log") 107 | ] 108 | ) 109 | 110 | # Suppress LiteLLM logging 111 | litellm_logger = logging.getLogger("LiteLLM") 112 | litellm_logger.setLevel(logging.WARNING) 113 | 
litellm_logger.propagate = False 114 | 115 | # Parse command line arguments 116 | parser = argparse.ArgumentParser( 117 | description="Benchmark a list of bots" 118 | ) 119 | parser.add_argument( 120 | "--mode", 121 | type=str, 122 | choices=["run", "custom", "display"], 123 | default="display", 124 | help="Specify the run mode (default: display)", 125 | ) 126 | args = parser.parse_args() 127 | mode: Literal["run", "custom", "display"] = ( 128 | args.mode 129 | ) 130 | asyncio.run(benchmark_forecast_bot(mode)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import logging 4 | import os 5 | from datetime import datetime 6 | from typing import Literal 7 | 8 | from forecasting_tools import ( 9 | AskNewsSearcher, 10 | BinaryQuestion, 11 | ForecastBot, 12 | GeneralLlm, 13 | MetaculusApi, 14 | MetaculusQuestion, 15 | MultipleChoiceQuestion, 16 | NumericDistribution, 17 | NumericQuestion, 18 | PredictedOptionList, 19 | PredictionExtractor, 20 | ReasonedPrediction, 21 | SmartSearcher, 22 | clean_indents, 23 | ) 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class TemplateForecaster(ForecastBot): 29 | """ 30 | This is a copy of the template bot for Q2 2025 Metaculus AI Tournament. 31 | The official bots on the leaderboard use AskNews in Q2. 32 | Main template bot changes since Q1 33 | - Support for new units parameter was added 34 | - You now set your llms when you initialize the bot (making it easier to switch between and benchmark different models) 35 | 36 | The main entry point of this bot is `forecast_on_tournament` in the parent class. 37 | See the script at the bottom of the file for more details on how to run the bot. 38 | Ignoring the finer details, the general flow is: 39 | - Load questions from Metaculus 40 | - For each question 41 | - Execute run_research a number of times equal to research_reports_per_question 42 | - Execute respective run_forecast function `predictions_per_research_report * research_reports_per_question` times 43 | - Aggregate the predictions 44 | - Submit prediction (if publish_reports_to_metaculus is True) 45 | - Return a list of ForecastReport objects 46 | 47 | Only the research and forecast functions need to be implemented in ForecastBot subclasses. 48 | 49 | If you end up having trouble with rate limits and want to try a more sophisticated rate limiter try: 50 | ``` 51 | from forecasting_tools.ai_models.resource_managers.refreshing_bucket_rate_limiter import RefreshingBucketRateLimiter 52 | rate_limiter = RefreshingBucketRateLimiter( 53 | capacity=2, 54 | refresh_rate=1, 55 | ) # Allows 1 request per second on average with a burst of 2 requests initially. 
Set this as a class variable.
56 |         await self.rate_limiter.wait_till_able_to_acquire_resources(1)  # 1 because it's consuming 1 request (use more if you are adding a token limit)
57 |         ```
58 |         Additionally, OpenRouter has large rate limits immediately on account creation.
59 |     """
60 | 
61 |     _max_concurrent_questions = 2  # Set this to whatever works for your search-provider/ai-model rate limits
62 |     _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions)
63 | 
64 |     async def run_research(self, question: MetaculusQuestion) -> str:
65 |         async with self._concurrency_limiter:
66 |             research = ""
67 |             if os.getenv("ASKNEWS_CLIENT_ID") and os.getenv("ASKNEWS_SECRET"):
68 |                 research = await AskNewsSearcher().get_formatted_news_async(
69 |                     question.question_text
70 |                 )
71 |             elif os.getenv("EXA_API_KEY"):
72 |                 research = await self._call_exa_smart_searcher(
73 |                     question.question_text
74 |                 )
75 |             elif os.getenv("PERPLEXITY_API_KEY"):
76 |                 research = await self._call_perplexity(question.question_text)
77 |             elif os.getenv("OPENROUTER_API_KEY"):
78 |                 research = await self._call_perplexity(
79 |                     question.question_text, use_open_router=True
80 |                 )
81 |             else:
82 |                 logger.warning(
83 |                     f"No research provider found when processing question URL {question.page_url}. Will pass back empty string."
84 |                 )
85 |                 research = ""
86 |             logger.info(
87 |                 f"Found Research for URL {question.page_url}:\n{research}"
88 |             )
89 |             return research
90 | 
91 |     async def _call_perplexity(
92 |         self, question: str, use_open_router: bool = False
93 |     ) -> str:
94 |         prompt = clean_indents(
95 |             f"""
96 |             You are an assistant to a superforecaster.
97 |             The superforecaster will give you a question they intend to forecast on.
98 |             To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
99 |             You do not produce forecasts yourself.
100 | 
101 |             Question:
102 |             {question}
103 |             """
104 |         )  # NOTE: The metac bot in Q1 put everything but the question in the system prompt.
105 |         if use_open_router:
106 |             model_name = "openrouter/perplexity/sonar-reasoning"
107 |         else:
108 |             model_name = "perplexity/sonar-pro"  # perplexity/sonar-reasoning and perplexity/sonar are cheaper, but do only 1 search
109 |         model = GeneralLlm(
110 |             model=model_name,
111 |             temperature=0.1,
112 |         )
113 |         response = await model.invoke(prompt)
114 |         return response
115 | 
116 |     async def _call_exa_smart_searcher(self, question: str) -> str:
117 |         """
118 |         SmartSearcher is a custom class that is a wrapper around a search on Exa.ai
119 |         """
120 |         searcher = SmartSearcher(
121 |             model=self.get_llm("default", "llm"),
122 |             temperature=0,
123 |             num_searches_to_run=2,
124 |             num_sites_per_search=10,
125 |         )
126 |         prompt = (
127 |             "You are an assistant to a superforecaster. The superforecaster will give "
128 |             "you a question they intend to forecast on. To be a great assistant, you generate "
129 |             "a concise but detailed rundown of the most relevant news, including if the question "
130 |             "would resolve Yes or No based on current information. You do not produce forecasts yourself."
131 | f"\n\nThe question is: {question}" 132 | ) # You can ask the searcher to filter by date, exclude/include a domain, and run specific searches for finding sources vs finding highlights within a source 133 | response = await searcher.invoke(prompt) 134 | return response 135 | 136 | async def _run_forecast_on_binary( 137 | self, question: BinaryQuestion, research: str 138 | ) -> ReasonedPrediction[float]: 139 | prompt = clean_indents( 140 | f""" 141 | You are a professional forecaster interviewing for a job. 142 | 143 | Your interview question is: 144 | {question.question_text} 145 | 146 | Question background: 147 | {question.background_info} 148 | 149 | 150 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: 151 | {question.resolution_criteria} 152 | 153 | {question.fine_print} 154 | 155 | 156 | Your research assistant says: 157 | {research} 158 | 159 | Today is {datetime.now().strftime("%Y-%m-%d")}. 160 | 161 | Before answering you write: 162 | (a) The time left until the outcome to the question is known. 163 | (b) The status quo outcome if nothing changed. 164 | (c) A brief description of a scenario that results in a No outcome. 165 | (d) A brief description of a scenario that results in a Yes outcome. 166 | 167 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 168 | 169 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100 170 | """ 171 | ) 172 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 173 | prediction: float = PredictionExtractor.extract_last_percentage_value( 174 | reasoning, max_prediction=1, min_prediction=0 175 | ) 176 | logger.info( 177 | f"Forecasted URL {question.page_url} as {prediction} with reasoning:\n{reasoning}" 178 | ) 179 | return ReasonedPrediction( 180 | prediction_value=prediction, reasoning=reasoning 181 | ) 182 | 183 | async def _run_forecast_on_multiple_choice( 184 | self, question: MultipleChoiceQuestion, research: str 185 | ) -> ReasonedPrediction[PredictedOptionList]: 186 | prompt = clean_indents( 187 | f""" 188 | You are a professional forecaster interviewing for a job. 189 | 190 | Your interview question is: 191 | {question.question_text} 192 | 193 | The options are: {question.options} 194 | 195 | 196 | Background: 197 | {question.background_info} 198 | 199 | {question.resolution_criteria} 200 | 201 | {question.fine_print} 202 | 203 | 204 | Your research assistant says: 205 | {research} 206 | 207 | Today is {datetime.now().strftime("%Y-%m-%d")}. 208 | 209 | Before answering you write: 210 | (a) The time left until the outcome to the question is known. 211 | (b) The status quo outcome if nothing changed. 212 | (c) A description of an scenario that results in an unexpected outcome. 213 | 214 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes. 215 | 216 | The last thing you write is your final probabilities for the N options in this order {question.options} as: 217 | Option_A: Probability_A 218 | Option_B: Probability_B 219 | ... 
220 | Option_N: Probability_N 221 | """ 222 | ) 223 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 224 | prediction: PredictedOptionList = ( 225 | PredictionExtractor.extract_option_list_with_percentage_afterwards( 226 | reasoning, question.options 227 | ) 228 | ) 229 | logger.info( 230 | f"Forecasted URL {question.page_url} as {prediction} with reasoning:\n{reasoning}" 231 | ) 232 | return ReasonedPrediction( 233 | prediction_value=prediction, reasoning=reasoning 234 | ) 235 | 236 | async def _run_forecast_on_numeric( 237 | self, question: NumericQuestion, research: str 238 | ) -> ReasonedPrediction[NumericDistribution]: 239 | upper_bound_message, lower_bound_message = ( 240 | self._create_upper_and_lower_bound_messages(question) 241 | ) 242 | prompt = clean_indents( 243 | f""" 244 | You are a professional forecaster interviewing for a job. 245 | 246 | Your interview question is: 247 | {question.question_text} 248 | 249 | Background: 250 | {question.background_info} 251 | 252 | {question.resolution_criteria} 253 | 254 | {question.fine_print} 255 | 256 | Units for answer: {question.unit_of_measure if question.unit_of_measure else "Not stated (please infer this)"} 257 | 258 | Your research assistant says: 259 | {research} 260 | 261 | Today is {datetime.now().strftime("%Y-%m-%d")}. 262 | 263 | {lower_bound_message} 264 | {upper_bound_message} 265 | 266 | Formatting Instructions: 267 | - Please notice the units requested (e.g. whether you represent a number as 1,000,000 or 1 million). 268 | - Never use scientific notation. 269 | - Always start with a smaller number (more negative if negative) and then increase from there 270 | 271 | Before answering you write: 272 | (a) The time left until the outcome to the question is known. 273 | (b) The outcome if nothing changed. 274 | (c) The outcome if the current trend continued. 275 | (d) The expectations of experts and markets. 276 | (e) A brief description of an unexpected scenario that results in a low outcome. 277 | (f) A brief description of an unexpected scenario that results in a high outcome. 278 | 279 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. 280 | 281 | The last thing you write is your final answer as: 282 | " 283 | Percentile 10: XX 284 | Percentile 20: XX 285 | Percentile 40: XX 286 | Percentile 60: XX 287 | Percentile 80: XX 288 | Percentile 90: XX 289 | " 290 | """ 291 | ) 292 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 293 | prediction: NumericDistribution = ( 294 | PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( 295 | reasoning, question 296 | ) 297 | ) 298 | logger.info( 299 | f"Forecasted URL {question.page_url} as {prediction.declared_percentiles} with reasoning:\n{reasoning}" 300 | ) 301 | return ReasonedPrediction( 302 | prediction_value=prediction, reasoning=reasoning 303 | ) 304 | 305 | def _create_upper_and_lower_bound_messages( 306 | self, question: NumericQuestion 307 | ) -> tuple[str, str]: 308 | if question.open_upper_bound: 309 | upper_bound_message = "" 310 | else: 311 | upper_bound_message = ( 312 | f"The outcome can not be higher than {question.upper_bound}." 313 | ) 314 | if question.open_lower_bound: 315 | lower_bound_message = "" 316 | else: 317 | lower_bound_message = ( 318 | f"The outcome can not be lower than {question.lower_bound}." 
319 | ) 320 | return upper_bound_message, lower_bound_message 321 | 322 | 323 | if __name__ == "__main__": 324 | logging.basicConfig( 325 | level=logging.INFO, 326 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 327 | ) 328 | 329 | # Suppress LiteLLM logging 330 | litellm_logger = logging.getLogger("LiteLLM") 331 | litellm_logger.setLevel(logging.WARNING) 332 | litellm_logger.propagate = False 333 | 334 | parser = argparse.ArgumentParser( 335 | description="Run the Q1TemplateBot forecasting system" 336 | ) 337 | parser.add_argument( 338 | "--mode", 339 | type=str, 340 | choices=["tournament", "quarterly_cup", "test_questions"], 341 | default="tournament", 342 | help="Specify the run mode (default: tournament)", 343 | ) 344 | args = parser.parse_args() 345 | run_mode: Literal["tournament", "quarterly_cup", "test_questions"] = ( 346 | args.mode 347 | ) 348 | assert run_mode in [ 349 | "tournament", 350 | "quarterly_cup", 351 | "test_questions", 352 | ], "Invalid run mode" 353 | 354 | template_bot = TemplateForecaster( 355 | research_reports_per_question=1, 356 | predictions_per_research_report=5, 357 | use_research_summary_to_forecast=False, 358 | publish_reports_to_metaculus=True, 359 | folder_to_save_reports_to=None, 360 | skip_previously_forecasted_questions=True, 361 | # llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you 362 | # "default": GeneralLlm( 363 | # model="metaculus/anthropic/claude-3-5-sonnet-20241022", 364 | # temperature=0.3, 365 | # timeout=40, 366 | # allowed_tries=2, 367 | # ), 368 | # "summarizer": "openai/gpt-4o-mini", 369 | # }, 370 | ) 371 | 372 | if run_mode == "tournament": 373 | forecast_reports = asyncio.run( 374 | template_bot.forecast_on_tournament( 375 | MetaculusApi.CURRENT_AI_COMPETITION_ID, return_exceptions=True 376 | ) 377 | ) 378 | elif run_mode == "quarterly_cup": 379 | # The quarterly cup is a good way to test the bot's performance on regularly open questions. 
You can also use AXC_2025_TOURNAMENT_ID = 32564 380 | # The new quarterly cup may not be initialized near the beginning of a quarter 381 | template_bot.skip_previously_forecasted_questions = False 382 | forecast_reports = asyncio.run( 383 | template_bot.forecast_on_tournament( 384 | MetaculusApi.CURRENT_QUARTERLY_CUP_ID, return_exceptions=True 385 | ) 386 | ) 387 | elif run_mode == "test_questions": 388 | # Example questions are a good way to test the bot's performance on a single question 389 | EXAMPLE_QUESTIONS = [ 390 | "https://www.metaculus.com/questions/578/human-extinction-by-2100/", # Human Extinction - Binary 391 | "https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/", # Age of Oldest Human - Numeric 392 | "https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/", # Number of New Leading AI Labs - Multiple Choice 393 | ] 394 | template_bot.skip_previously_forecasted_questions = False 395 | questions = [ 396 | MetaculusApi.get_question_by_url(question_url) 397 | for question_url in EXAMPLE_QUESTIONS 398 | ] 399 | forecast_reports = asyncio.run( 400 | template_bot.forecast_questions(questions, return_exceptions=True) 401 | ) 402 | TemplateForecaster.log_report_summary(forecast_reports) # type: ignore 403 | -------------------------------------------------------------------------------- /main_with_no_framework.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | import json 4 | import os 5 | import re 6 | import dotenv 7 | dotenv.load_dotenv() 8 | 9 | from openai import AsyncOpenAI 10 | import numpy as np 11 | import requests 12 | import forecasting_tools 13 | from asknews_sdk import AskNewsSDK 14 | 15 | 16 | ######################### CONSTANTS ######################### 17 | # Constants 18 | SUBMIT_PREDICTION = True # set to True to publish your predictions to Metaculus 19 | USE_EXAMPLE_QUESTIONS = False # set to True to forecast example questions rather than the tournament questions 20 | NUM_RUNS_PER_QUESTION = 5 # The median forecast is taken between NUM_RUNS_PER_QUESTION runs 21 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS = True 22 | 23 | # Environment variables 24 | # You only need *either* Exa or Perplexity or AskNews keys for online research 25 | METACULUS_TOKEN = os.getenv("METACULUS_TOKEN") 26 | PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY") 27 | ASKNEWS_CLIENT_ID = os.getenv("ASKNEWS_CLIENT_ID") 28 | ASKNEWS_SECRET = os.getenv("ASKNEWS_SECRET") 29 | EXA_API_KEY = os.getenv("EXA_API_KEY") 30 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # You'll also need the OpenAI API Key if you want to use the Exa Smart Searcher 31 | 32 | # The tournament IDs below can be used for testing your bot. 33 | Q4_2024_AI_BENCHMARKING_ID = 32506 34 | Q1_2025_AI_BENCHMARKING_ID = 32627 35 | Q4_2024_QUARTERLY_CUP_ID = 3672 36 | Q1_2025_QUARTERLY_CUP_ID = 32630 37 | AXC_2025_TOURNAMENT_ID = 32564 38 | GIVEWELL_ID = 3600 39 | RESPIRATORY_OUTLOOK_ID = 3411 40 | 41 | TOURNAMENT_ID = Q1_2025_AI_BENCHMARKING_ID 42 | 43 | # The example questions can be used for testing your bot. 
(note that question and post id are not always the same) 44 | EXAMPLE_QUESTIONS = [ # (question_id, post_id) 45 | (578, 578), # Human Extinction - Binary - https://www.metaculus.com/questions/578/human-extinction-by-2100/ 46 | (14333, 14333), # Age of Oldest Human - Numeric - https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/ 47 | (22427, 22427), # Number of New Leading AI Labs - Multiple Choice - https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/ 48 | ] 49 | 50 | # Also, we realize the below code could probably be cleaned up a bit in a few places 51 | # Though we are assuming most people will dissect it enough to make this not matter much 52 | 53 | ######################### HELPER FUNCTIONS ######################### 54 | 55 | # @title Helper functions 56 | AUTH_HEADERS = {"headers": {"Authorization": f"Token {METACULUS_TOKEN}"}} 57 | API_BASE_URL = "https://www.metaculus.com/api" 58 | 59 | 60 | def post_question_comment(post_id: int, comment_text: str) -> None: 61 | """ 62 | Post a comment on the question page as the bot user. 63 | """ 64 | 65 | response = requests.post( 66 | f"{API_BASE_URL}/comments/create/", 67 | json={ 68 | "text": comment_text, 69 | "parent": None, 70 | "included_forecast": True, 71 | "is_private": True, 72 | "on_post": post_id, 73 | }, 74 | **AUTH_HEADERS, # type: ignore 75 | ) 76 | if not response.ok: 77 | raise RuntimeError(response.text) 78 | 79 | 80 | def post_question_prediction(question_id: int, forecast_payload: dict) -> None: 81 | """ 82 | Post a forecast on a question. 83 | """ 84 | url = f"{API_BASE_URL}/questions/forecast/" 85 | response = requests.post( 86 | url, 87 | json=[ 88 | { 89 | "question": question_id, 90 | **forecast_payload, 91 | }, 92 | ], 93 | **AUTH_HEADERS, # type: ignore 94 | ) 95 | print(f"Prediction Post status code: {response.status_code}") 96 | if not response.ok: 97 | raise RuntimeError(response.text) 98 | 99 | 100 | def create_forecast_payload( 101 | forecast: float | dict[str, float] | list[float], 102 | question_type: str, 103 | ) -> dict: 104 | """ 105 | Accepts a forecast and generates the api payload in the correct format. 106 | 107 | If the question is binary, forecast must be a float. 108 | If the question is multiple choice, forecast must be a dictionary that 109 | maps question.options labels to floats. 110 | If the question is numeric, forecast must be a dictionary that maps 111 | quartiles or percentiles to datetimes, or a 201 value cdf. 
112 | """ 113 | if question_type == "binary": 114 | return { 115 | "probability_yes": forecast, 116 | "probability_yes_per_category": None, 117 | "continuous_cdf": None, 118 | } 119 | if question_type == "multiple_choice": 120 | return { 121 | "probability_yes": None, 122 | "probability_yes_per_category": forecast, 123 | "continuous_cdf": None, 124 | } 125 | # numeric or date 126 | return { 127 | "probability_yes": None, 128 | "probability_yes_per_category": None, 129 | "continuous_cdf": forecast, 130 | } 131 | 132 | 133 | def list_posts_from_tournament( 134 | tournament_id: int = TOURNAMENT_ID, offset: int = 0, count: int = 50 135 | ) -> list[dict]: 136 | """ 137 | List (all details) {count} posts from the {tournament_id} 138 | """ 139 | url_qparams = { 140 | "limit": count, 141 | "offset": offset, 142 | "order_by": "-hotness", 143 | "forecast_type": ",".join( 144 | [ 145 | "binary", 146 | "multiple_choice", 147 | "numeric", 148 | ] 149 | ), 150 | "tournaments": [tournament_id], 151 | "statuses": "open", 152 | "include_description": "true", 153 | } 154 | url = f"{API_BASE_URL}/posts/" 155 | response = requests.get(url, **AUTH_HEADERS, params=url_qparams) # type: ignore 156 | if not response.ok: 157 | raise Exception(response.text) 158 | data = json.loads(response.content) 159 | return data 160 | 161 | 162 | def get_open_question_ids_from_tournament() -> list[tuple[int, int]]: 163 | posts = list_posts_from_tournament() 164 | 165 | post_dict = dict() 166 | for post in posts["results"]: 167 | if question := post.get("question"): 168 | # single question post 169 | post_dict[post["id"]] = [question] 170 | 171 | open_question_id_post_id = [] # [(question_id, post_id)] 172 | for post_id, questions in post_dict.items(): 173 | for question in questions: 174 | if question.get("status") == "open": 175 | print( 176 | f"ID: {question['id']}\nQ: {question['title']}\nCloses: " 177 | f"{question['scheduled_close_time']}" 178 | ) 179 | open_question_id_post_id.append((question["id"], post_id)) 180 | 181 | return open_question_id_post_id 182 | 183 | 184 | def get_post_details(post_id: int) -> dict: 185 | """ 186 | Get all details about a post from the Metaculus API. 187 | """ 188 | url = f"{API_BASE_URL}/posts/{post_id}/" 189 | print(f"Getting details for {url}") 190 | response = requests.get( 191 | url, 192 | **AUTH_HEADERS, # type: ignore 193 | ) 194 | if not response.ok: 195 | raise Exception(response.text) 196 | details = json.loads(response.content) 197 | return details 198 | 199 | CONCURRENT_REQUESTS_LIMIT = 5 200 | llm_rate_limiter = asyncio.Semaphore(CONCURRENT_REQUESTS_LIMIT) 201 | 202 | 203 | async def call_llm(prompt: str, model: str = "gpt-4o", temperature: float = 0.3) -> str: 204 | """ 205 | Makes a streaming completion request to OpenAI's API with concurrent request limiting. 206 | """ 207 | 208 | # Remove the base_url parameter to call the OpenAI API directly 209 | # Also checkout the package 'litellm' for one function that can call any model from any provider 210 | # Email ben@metaculus.com if you need credit for the Metaculus OpenAI/Anthropic proxy 211 | client = AsyncOpenAI( 212 | base_url="https://llm-proxy.metaculus.com/proxy/openai/v1", 213 | default_headers={ 214 | "Content-Type": "application/json", 215 | "Authorization": f"Token {METACULUS_TOKEN}", 216 | }, 217 | api_key="Fake API Key since openai requires this not to be NONE. 
233 | 
234 | def run_research(question: str) -> str:
235 |     research = ""
236 |     if ASKNEWS_CLIENT_ID and ASKNEWS_SECRET:
237 |         research = call_asknews(question)
238 |     elif EXA_API_KEY:
239 |         research = call_exa_smart_searcher(question)
240 |     elif PERPLEXITY_API_KEY:
241 |         research = call_perplexity(question)
242 |     else:
243 |         research = "No research done"
244 | 
245 |     print(f"########################\nResearch Found:\n{research}\n########################")
246 | 
247 |     return research
248 | 
249 | def call_perplexity(question: str) -> str:
250 |     url = "https://api.perplexity.ai/chat/completions"
251 |     api_key = PERPLEXITY_API_KEY
252 |     headers = {
253 |         "accept": "application/json",
254 |         "authorization": f"Bearer {api_key}",
255 |         "content-type": "application/json",
256 |     }
257 |     payload = {
258 |         "model": "llama-3.1-sonar-huge-128k-online",
259 |         "messages": [
260 |             {
261 |                 "role": "system",  # this is a system prompt designed to guide the perplexity assistant
262 |                 "content": """
263 | You are an assistant to a superforecaster.
264 | The superforecaster will give you a question they intend to forecast on.
265 | To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
266 | You do not produce forecasts yourself.
267 | """,
268 |             },
269 |             {
270 |                 "role": "user",  # this is the actual prompt we ask the perplexity assistant to answer
271 |                 "content": question,
272 |             },
273 |         ],
274 |     }
275 |     response = requests.post(url=url, json=payload, headers=headers)
276 |     if not response.ok:
277 |         raise Exception(response.text)
278 |     content = response.json()["choices"][0]["message"]["content"]
279 |     return content
280 | 
281 | def call_exa_smart_searcher(question: str) -> str:
282 |     if OPENAI_API_KEY is None:
283 |         searcher = forecasting_tools.ExaSearcher(
284 |             include_highlights=True,
285 |             num_results=10,
286 |         )
287 |         highlights = asyncio.run(searcher.invoke_for_highlights_in_relevance_order(question))
288 |         prioritized_highlights = highlights[:10]
289 |         combined_highlights = ""
290 |         for i, highlight in enumerate(prioritized_highlights):
291 |             combined_highlights += f'[Highlight {i+1}]:\nTitle: {highlight.source.title}\nURL: {highlight.source.url}\nText: "{highlight.highlight_text}"\n\n'
292 |         response = combined_highlights
293 |     else:
294 |         searcher = forecasting_tools.SmartSearcher(
295 |             temperature=0,
296 |             num_searches_to_run=2,
297 |             num_sites_per_search=10,
298 |         )
299 |         prompt = (
300 |             "You are an assistant to a superforecaster. The superforecaster will give "
301 |             "you a question they intend to forecast on. To be a great assistant, you generate "
302 |             "a concise but detailed rundown of the most relevant news, including if the question "
303 |             "would resolve Yes or No based on current information. You do not produce forecasts yourself."
304 | f"\n\nThe question is: {question}" 305 | ) 306 | response = asyncio.run(searcher.invoke(prompt)) 307 | 308 | return response 309 | 310 | def call_asknews(question: str) -> str: 311 | """ 312 | Use the AskNews `news` endpoint to get news context for your query. 313 | The full API reference can be found here: https://docs.asknews.app/en/reference#get-/v1/news/search 314 | """ 315 | ask = AskNewsSDK( 316 | client_id=ASKNEWS_CLIENT_ID, client_secret=ASKNEWS_SECRET, scopes=set(["news"]) 317 | ) 318 | 319 | # get the latest news related to the query (within the past 48 hours) 320 | hot_response = ask.news.search_news( 321 | query=question, # your natural language query 322 | n_articles=6, # control the number of articles to include in the context, originally 5 323 | return_type="both", 324 | strategy="latest news", # enforces looking at the latest news only 325 | ) 326 | 327 | # get context from the "historical" database that contains a news archive going back to 2023 328 | historical_response = ask.news.search_news( 329 | query=question, 330 | n_articles=10, 331 | return_type="both", 332 | strategy="news knowledge", # looks for relevant news within the past 60 days 333 | ) 334 | 335 | hot_articles = hot_response.as_dicts 336 | historical_articles = historical_response.as_dicts 337 | formatted_articles = "Here are the relevant news articles:\n\n" 338 | 339 | if hot_articles: 340 | hot_articles = [article.__dict__ for article in hot_articles] 341 | hot_articles = sorted(hot_articles, key=lambda x: x["pub_date"], reverse=True) 342 | 343 | for article in hot_articles: 344 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p") 345 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n" 346 | 347 | if historical_articles: 348 | historical_articles = [article.__dict__ for article in historical_articles] 349 | historical_articles = sorted( 350 | historical_articles, key=lambda x: x["pub_date"], reverse=True 351 | ) 352 | 353 | for article in historical_articles: 354 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p") 355 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n" 356 | 357 | if not hot_articles and not historical_articles: 358 | formatted_articles += "No articles were found.\n\n" 359 | return formatted_articles 360 | 361 | return formatted_articles 362 | 363 | ############### BINARY ############### 364 | # @title Binary prompt & functions 365 | 366 | # This section includes functionality for binary questions. 367 | 368 | BINARY_PROMPT_TEMPLATE = """ 369 | You are a professional forecaster interviewing for a job. 370 | 371 | Your interview question is: 372 | {title} 373 | 374 | Question background: 375 | {background} 376 | 377 | 378 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: 379 | {resolution_criteria} 380 | 381 | {fine_print} 382 | 383 | 384 | Your research assistant says: 385 | {summary_report} 386 | 387 | Today is {today}. 388 | 389 | Before answering you write: 390 | (a) The time left until the outcome to the question is known. 391 | (b) The status quo outcome if nothing changed. 392 | (c) A brief description of a scenario that results in a No outcome. 
393 | (d) A brief description of a scenario that results in a Yes outcome. 394 | 395 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 396 | 397 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100 398 | """ 399 | 400 | 401 | def extract_probability_from_response_as_percentage_not_decimal( 402 | forecast_text: str, 403 | ) -> float: 404 | matches = re.findall(r"(\d+)%", forecast_text) 405 | if matches: 406 | # Return the last number found before a '%' 407 | number = int(matches[-1]) 408 | number = min(99, max(1, number)) # clamp the number between 1 and 99 409 | return number 410 | else: 411 | raise ValueError(f"Could not extract prediction from response: {forecast_text}") 412 | 413 | 414 | async def get_binary_gpt_prediction( 415 | question_details: dict, num_runs: int 416 | ) -> tuple[float, str]: 417 | 418 | today = datetime.datetime.now().strftime("%Y-%m-%d") 419 | title = question_details["title"] 420 | resolution_criteria = question_details["resolution_criteria"] 421 | background = question_details["description"] 422 | fine_print = question_details["fine_print"] 423 | question_type = question_details["type"] 424 | 425 | summary_report = run_research(title) 426 | 427 | content = BINARY_PROMPT_TEMPLATE.format( 428 | title=title, 429 | today=today, 430 | background=background, 431 | resolution_criteria=resolution_criteria, 432 | fine_print=fine_print, 433 | summary_report=summary_report, 434 | ) 435 | 436 | async def get_rationale_and_probability(content: str) -> tuple[float, str]: 437 | rationale = await call_llm(content) 438 | 439 | probability = extract_probability_from_response_as_percentage_not_decimal( 440 | rationale 441 | ) 442 | comment = ( 443 | f"Extracted Probability: {probability}%\n\nGPT's Answer: " 444 | f"{rationale}\n\n\n" 445 | ) 446 | return probability, comment 447 | 448 | probability_and_comment_pairs = await asyncio.gather( 449 | *[get_rationale_and_probability(content) for _ in range(num_runs)] 450 | ) 451 | comments = [pair[1] for pair in probability_and_comment_pairs] 452 | final_comment_sections = [ 453 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments) 454 | ] 455 | probabilities = [pair[0] for pair in probability_and_comment_pairs] 456 | median_probability = float(np.median(probabilities)) / 100 457 | 458 | final_comment = f"Median Probability: {median_probability}\n\n" + "\n\n".join( 459 | final_comment_sections 460 | ) 461 | return median_probability, final_comment 462 | 463 | 464 | ####################### NUMERIC ############### 465 | # @title Numeric prompt & functions 466 | 467 | NUMERIC_PROMPT_TEMPLATE = """ 468 | You are a professional forecaster interviewing for a job. 469 | 470 | Your interview question is: 471 | {title} 472 | 473 | Background: 474 | {background} 475 | 476 | {resolution_criteria} 477 | 478 | {fine_print} 479 | 480 | Units for answer: {units} 481 | 482 | Your research assistant says: 483 | {summary_report} 484 | 485 | Today is {today}. 486 | 487 | {lower_bound_message} 488 | {upper_bound_message} 489 | 490 | 491 | Formatting Instructions: 492 | - Please notice the units requested (e.g. whether you represent a number as 1,000,000 or 1m). 493 | - Never use scientific notation. 494 | - Always start with a smaller number (more negative if negative) and then increase from there 495 | 496 | Before answering you write: 497 | (a) The time left until the outcome to the question is known. 
498 | (b) The outcome if nothing changed.
499 | (c) The outcome if the current trend continued.
500 | (d) The expectations of experts and markets.
501 | (e) A brief description of an unexpected scenario that results in a low outcome.
502 | (f) A brief description of an unexpected scenario that results in a high outcome.
503 | 
504 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.
505 | 
506 | The last thing you write is your final answer as:
507 | "
508 | Percentile 10: XX
509 | Percentile 20: XX
510 | Percentile 40: XX
511 | Percentile 60: XX
512 | Percentile 80: XX
513 | Percentile 90: XX
514 | "
515 | """
516 | 
517 | 
518 | def extract_percentiles_from_response(forecast_text: str) -> dict:
519 | 
520 |     # Helper function that returns a dict mapping percentile -> value for every line mentioning "Percentile"
521 |     def extract_percentile_numbers(text) -> dict:
522 |         pattern = r"^.*(?:P|p)ercentile.*$"
523 |         number_pattern = r"-\s*(?:[^\d\-]*\s*)?(\d+(?:,\d{3})*(?:\.\d+)?)|(\d+(?:,\d{3})*(?:\.\d+)?)"
524 |         results = []
525 | 
526 |         for line in text.split("\n"):
527 |             if re.match(pattern, line):
528 |                 numbers = re.findall(number_pattern, line)
529 |                 numbers_no_commas = [
530 |                     next(num for num in match if num).replace(",", "")
531 |                     for match in numbers
532 |                 ]
533 |                 numbers = [
534 |                     float(num) if "." in num else int(num)
535 |                     for num in numbers_no_commas
536 |                 ]
537 |                 if len(numbers) > 1:
538 |                     first_number = numbers[0]
539 |                     last_number = numbers[-1]
540 |                     # Check if the original line had a negative sign before the last number
541 |                     if "-" in line.split(":")[-1]:
542 |                         last_number = -abs(last_number)
543 |                     results.append((first_number, last_number))
544 | 
545 |         # Convert results to dictionary
546 |         percentile_values = {}
547 |         for first_num, second_num in results:
548 |             key = first_num
549 |             percentile_values[key] = second_num
550 | 
551 |         return percentile_values
552 | 
553 |     percentile_values = extract_percentile_numbers(forecast_text)
554 | 
555 |     if len(percentile_values) > 0:
556 |         return percentile_values
557 |     else:
558 |         raise ValueError(f"Could not extract prediction from response: {forecast_text}")
559 | 
560 | 
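# A quick illustration (hypothetical response text) of what the extractor returns:
#
#   extract_percentiles_from_response("Percentile 10: 105\nPercentile 90: 230")
#   => {10: 105, 90: 230}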
561 | def generate_continuous_cdf(
562 |     percentile_values: dict,
563 |     question_type: str,
564 |     open_upper_bound: bool,
565 |     open_lower_bound: bool,
566 |     upper_bound: float,
567 |     lower_bound: float,
568 |     zero_point: float | None,
569 | ) -> list[float]:
570 |     """
571 |     Returns: list[float]: A list of 201 float values representing the CDF.
572 |     """
573 | 
574 |     percentile_max = max(float(key) for key in percentile_values.keys())
575 |     percentile_min = min(float(key) for key in percentile_values.keys())
576 |     range_min = lower_bound
577 |     range_max = upper_bound
578 |     range_size = range_max - range_min
579 |     buffer = 1 if range_size > 100 else 0.01 * range_size
580 | 
581 |     # Adjust any values that are exactly at the bounds
582 |     for percentile, value in list(percentile_values.items()):
583 |         if not open_lower_bound and value <= range_min + buffer:
584 |             percentile_values[percentile] = range_min + buffer
585 |         if not open_upper_bound and value >= range_max - buffer:
586 |             percentile_values[percentile] = range_max - buffer
587 | 
588 |     # Set cdf values outside the upper bound
589 |     if open_upper_bound:
590 |         if range_max > percentile_values[percentile_max]:
591 |             percentile_values[int(100 - (0.5 * (100 - percentile_max)))] = range_max
592 |     else:
593 |         percentile_values[100] = range_max
594 | 
595 |     # Set cdf values outside the lower bound
596 |     if open_lower_bound:
597 |         if range_min < percentile_values[percentile_min]:
598 |             percentile_values[int(0.5 * percentile_min)] = range_min
599 |     else:
600 |         percentile_values[0] = range_min
601 | 
602 |     sorted_percentile_values = dict(sorted(percentile_values.items()))
603 | 
604 |     # Normalize percentile keys from 0-100 to 0-1
605 |     normalized_percentile_values = {}
606 |     for key, value in sorted_percentile_values.items():
607 |         percentile = float(key) / 100
608 |         normalized_percentile_values[percentile] = value
609 | 
610 |     # Invert the mapping so a percentile can be looked up by value
611 |     value_percentiles = {
612 |         value: key for key, value in normalized_percentile_values.items()
613 |     }
614 | 
615 |     # function for log scaled questions
616 |     def generate_cdf_locations(range_min, range_max, zero_point):
617 |         if zero_point is None:
618 |             scale = lambda x: range_min + (range_max - range_min) * x
619 |         else:
620 |             deriv_ratio = (range_max - zero_point) / (range_min - zero_point)
621 |             scale = lambda x: range_min + (range_max - range_min) * (
622 |                 deriv_ratio**x - 1
623 |             ) / (deriv_ratio - 1)
624 |         return [scale(x) for x in np.linspace(0, 1, 201)]
625 | 
626 |     cdf_xaxis = generate_cdf_locations(range_min, range_max, zero_point)
627 | 
628 |     def linear_interpolation(x_values, xy_pairs):
629 |         # Sort the xy_pairs by x-values
630 |         sorted_pairs = sorted(xy_pairs.items())
631 | 
632 |         # Extract sorted x and y values
633 |         known_x = [pair[0] for pair in sorted_pairs]
634 |         known_y = [pair[1] for pair in sorted_pairs]
635 | 
636 |         # Initialize the result list
637 |         y_values = []
638 | 
639 |         for x in x_values:
640 |             # Check if x is exactly in the known x values
641 |             if x in known_x:
642 |                 y_values.append(known_y[known_x.index(x)])
643 |             else:
644 |                 # Find the indices of the two nearest known x-values
645 |                 i = 0
646 |                 while i < len(known_x) and known_x[i] < x:
647 |                     i += 1
648 | 
649 |                 # i is now the index of the first known x-value greater than x
650 | 
651 |                 # If x is outside the range of known x-values, use the nearest endpoint
652 |                 if i == 0:
653 |                     y_values.append(known_y[0])
654 |                 elif i == len(known_x):
655 |                     y_values.append(known_y[-1])
656 |                 else:
657 |                     # Perform linear interpolation
658 |                     x0, x1 = known_x[i - 1], known_x[i]
659 |                     y0, y1 = known_y[i - 1], known_y[i]
660 | 
661 |                     # Linear interpolation formula
662 |                     y = y0 + (x - x0) * (y1 - y0) / (x1 - x0)
663 |                     y_values.append(y)
664 | 
665 |         return y_values
666 | 
667 |     continuous_cdf = linear_interpolation(cdf_xaxis, value_percentiles)
668 |     return continuous_cdf
669 | 
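# An illustrative call (hypothetical numbers): six LLM percentiles on a 0-100
# question with a closed lower bound and an open upper bound become a
# 201-point CDF over the question's range.
#
#   percentiles = {10: 20, 20: 25, 40: 30, 60: 35, 80: 45, 90: 55}
#   cdf = generate_continuous_cdf(
#       percentiles, "numeric",
#       open_upper_bound=True, open_lower_bound=False,
#       upper_bound=100, lower_bound=0, zero_point=None,
#   )
#   assert len(cdf) == 201  # non-decreasing values in [0, 1]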
670 | 
671 | async def get_numeric_gpt_prediction(
672 |     question_details: dict, num_runs: int
673 | ) -> tuple[list[float], str]:
674 | 
675 |     today = datetime.datetime.now().strftime("%Y-%m-%d")
676 |     title = question_details["title"]
677 |     resolution_criteria = question_details["resolution_criteria"]
678 |     background = question_details["description"]
679 |     fine_print = question_details["fine_print"]
680 |     question_type = question_details["type"]
681 |     scaling = question_details["scaling"]
682 |     open_upper_bound = question_details["open_upper_bound"]
683 |     open_lower_bound = question_details["open_lower_bound"]
684 |     unit_of_measure = question_details["unit"] if question_details["unit"] else "Not stated (please infer this)"
685 |     upper_bound = scaling["range_max"]
686 |     lower_bound = scaling["range_min"]
687 |     zero_point = scaling["zero_point"]
688 | 
689 |     # Create messages about the bounds that are passed in the LLM prompt
690 |     if open_upper_bound:
691 |         upper_bound_message = ""
692 |     else:
693 |         upper_bound_message = f"The outcome can not be higher than {upper_bound}."
694 |     if open_lower_bound:
695 |         lower_bound_message = ""
696 |     else:
697 |         lower_bound_message = f"The outcome can not be lower than {lower_bound}."
698 | 
699 |     summary_report = run_research(title)
700 | 
701 |     content = NUMERIC_PROMPT_TEMPLATE.format(
702 |         title=title,
703 |         today=today,
704 |         background=background,
705 |         resolution_criteria=resolution_criteria,
706 |         fine_print=fine_print,
707 |         summary_report=summary_report,
708 |         lower_bound_message=lower_bound_message,
709 |         upper_bound_message=upper_bound_message,
710 |         units=unit_of_measure,
711 |     )
712 | 
713 |     async def ask_llm_to_get_cdf(content: str) -> tuple[list[float], str]:
714 |         rationale = await call_llm(content)
715 |         percentile_values = extract_percentiles_from_response(rationale)
716 | 
717 |         comment = (
718 |             f"Extracted Percentile_values: {percentile_values}\n\nGPT's Answer: "
719 |             f"{rationale}\n\n\n"
720 |         )
721 | 
722 |         cdf = generate_continuous_cdf(
723 |             percentile_values,
724 |             question_type,
725 |             open_upper_bound,
726 |             open_lower_bound,
727 |             upper_bound,
728 |             lower_bound,
729 |             zero_point,
730 |         )
731 | 
732 |         return cdf, comment
733 | 
734 |     cdf_and_comment_pairs = await asyncio.gather(
735 |         *[ask_llm_to_get_cdf(content) for _ in range(num_runs)]
736 |     )
737 |     comments = [pair[1] for pair in cdf_and_comment_pairs]
738 |     final_comment_sections = [
739 |         f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments)
740 |     ]
741 |     cdfs: list[list[float]] = [pair[0] for pair in cdf_and_comment_pairs]
742 |     all_cdfs = np.array(cdfs)
743 |     median_cdf: list[float] = np.median(all_cdfs, axis=0).tolist()
744 | 
745 |     final_comment = f"Median CDF: `{str(median_cdf)[:100]}...`\n\n" + "\n\n".join(
746 |         final_comment_sections
747 |     )
748 |     return median_cdf, final_comment
749 | 
750 | 
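# A minimal usage sketch (illustrative; requires a real METACULUS_TOKEN and an
# open numeric question): fetch the post details, sample the LLM three times,
# and take the element-wise median of the resulting CDFs.
#
#   details = get_post_details(14333)  # example post id from EXAMPLE_QUESTIONS
#   cdf, comment = asyncio.run(get_numeric_gpt_prediction(details["question"], num_runs=3))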
751 | ########################## MULTIPLE CHOICE ###############
752 | # @title Multiple Choice prompt & functions
753 | 
754 | MULTIPLE_CHOICE_PROMPT_TEMPLATE = """
755 | You are a professional forecaster interviewing for a job.
756 | 
757 | Your interview question is:
758 | {title}
759 | 
760 | The options are: {options}
761 | 
762 | 
763 | Background:
764 | {background}
765 | 
766 | {resolution_criteria}
767 | 
768 | {fine_print}
769 | 
770 | 
771 | Your research assistant says:
772 | {summary_report}
773 | 
774 | Today is {today}.
775 | 
776 | Before answering you write:
777 | (a) The time left until the outcome to the question is known.
778 | (b) The status quo outcome if nothing changed.
779 | (c) A description of a scenario that results in an unexpected outcome.
780 | 
781 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes.
782 | 
783 | The last thing you write is your final probabilities for the N options in this order {options} as:
784 | Option_A: Probability_A
785 | Option_B: Probability_B
786 | ...
787 | Option_N: Probability_N
788 | """
789 | 
790 | 
791 | def extract_option_probabilities_from_response(forecast_text: str, options) -> list[float]:
792 | 
793 |     # Helper function that returns the last number found on each line of the text
794 |     def extract_option_probabilities(text):
795 | 
796 |         # Number extraction pattern
797 |         number_pattern = r"-?\d+(?:,\d{3})*(?:\.\d+)?"
798 | 
799 |         results = []
800 | 
801 |         # Iterate through each line in the text
802 |         for line in text.split("\n"):
803 |             # Extract all numbers from the line
804 |             numbers = re.findall(number_pattern, line)
805 |             numbers_no_commas = [num.replace(",", "") for num in numbers]
806 |             # Convert strings to float or int
807 |             numbers = [
808 |                 float(num) if "." in num else int(num) for num in numbers_no_commas
809 |             ]
810 |             # Keep the last number found on the line
811 |             if len(numbers) >= 1:
812 |                 last_number = numbers[-1]
813 |                 results.append(last_number)
814 | 
815 |         return results
816 | 
817 |     option_probabilities = extract_option_probabilities(forecast_text)
818 | 
819 |     NUM_OPTIONS = len(options)
820 | 
821 |     if len(option_probabilities) > 0:
822 |         # return the last NUM_OPTIONS items
823 |         return option_probabilities[-NUM_OPTIONS:]
824 |     else:
825 |         raise ValueError(f"Could not extract prediction from response: {forecast_text}")
826 | 
827 | 
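# Illustrative (hypothetical response text): the extractor keeps the last number
# on each line and returns the final len(options) of them.
#
#   extract_option_probabilities_from_response(
#       "Option_A: 20\nOption_B: 50\nOption_C: 30", options=["A", "B", "C"]
#   )
#   => [20, 50, 30]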
828 | def generate_multiple_choice_forecast(options, option_probabilities) -> dict:
829 |     """
830 |     Returns: dict corresponding to the probabilities of each option.
831 |     """
832 | 
833 |     # confirm that there is a probability for each option
834 |     if len(options) != len(option_probabilities):
835 |         raise ValueError(
836 |             f"Number of options ({len(options)}) does not match number of probabilities ({len(option_probabilities)})"
837 |         )
838 | 
839 |     # Ensure we are using decimals
840 |     total_sum = sum(option_probabilities)
841 |     decimal_list = [x / total_sum for x in option_probabilities]
842 | 
843 |     def normalize_list(float_list):
844 |         # Step 1: Clamp values
845 |         clamped_list = [max(min(x, 0.99), 0.01) for x in float_list]
846 | 
847 |         # Step 2: Calculate the sum of all elements
848 |         total_sum = sum(clamped_list)
849 | 
850 |         # Step 3: Normalize the list so that all elements add up to 1
851 |         normalized_list = [x / total_sum for x in clamped_list]
852 | 
853 |         # Step 4: Adjust for any small floating-point errors
854 |         adjustment = 1.0 - sum(normalized_list)
855 |         normalized_list[-1] += adjustment
856 | 
857 |         return normalized_list
858 | 
859 |     normalized_option_probabilities = normalize_list(decimal_list)
860 | 
861 |     probability_yes_per_category = {}
862 |     for i in range(len(options)):
863 |         probability_yes_per_category[options[i]] = normalized_option_probabilities[i]
864 | 
865 |     return probability_yes_per_category
866 | 
867 | 
868 | async def get_multiple_choice_gpt_prediction(
869 |     question_details: dict,
870 |     num_runs: int,
871 | ) -> tuple[dict[str, float], str]:
872 | 
873 |     today = datetime.datetime.now().strftime("%Y-%m-%d")
874 |     title = question_details["title"]
875 |     resolution_criteria = question_details["resolution_criteria"]
876 |     background = question_details["description"]
877 |     fine_print = question_details["fine_print"]
878 |     question_type = question_details["type"]
879 |     options = question_details["options"]
880 | 
881 |     summary_report = run_research(title)
882 | 
883 |     content = MULTIPLE_CHOICE_PROMPT_TEMPLATE.format(
884 |         title=title,
885 |         today=today,
886 |         background=background,
887 |         resolution_criteria=resolution_criteria,
888 |         fine_print=fine_print,
889 |         summary_report=summary_report,
890 |         options=options,
891 |     )
892 | 
893 |     async def ask_llm_for_multiple_choice_probabilities(
894 |         content: str,
895 |     ) -> tuple[dict[str, float], str]:
896 |         rationale = await call_llm(content)
897 | 
898 | 
899 |         option_probabilities = extract_option_probabilities_from_response(
900 |             rationale, options
901 |         )
902 | 
903 |         comment = (
904 |             f"EXTRACTED_PROBABILITIES: {option_probabilities}\n\nGPT's Answer: "
905 |             f"{rationale}\n\n\n"
906 |         )
907 | 
908 |         probability_yes_per_category = generate_multiple_choice_forecast(
909 |             options, option_probabilities
910 |         )
911 |         return probability_yes_per_category, comment
912 | 
913 |     probability_yes_per_category_and_comment_pairs = await asyncio.gather(
914 |         *[ask_llm_for_multiple_choice_probabilities(content) for _ in range(num_runs)]
915 |     )
916 |     comments = [pair[1] for pair in probability_yes_per_category_and_comment_pairs]
917 |     final_comment_sections = [
918 |         f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments)
919 |     ]
920 |     probability_yes_per_category_dicts: list[dict[str, float]] = [
921 |         pair[0] for pair in probability_yes_per_category_and_comment_pairs
922 |     ]
923 |     average_probability_yes_per_category: dict[str, float] = {}
924 |     for option in options:
925 |         probabilities_for_current_option: list[float] = [
926 |             d[option] for d in probability_yes_per_category_dicts  # avoid shadowing the builtin dict
927 |         ]
928 |         average_probability_yes_per_category[option] = sum(
929 |             probabilities_for_current_option
930 |         ) / len(probabilities_for_current_option)
931 | 
932 |     final_comment = (
933 |         f"Average Probability Yes Per Category: `{average_probability_yes_per_category}`\n\n"
934 |         + "\n\n".join(final_comment_sections)
935 |     )
936 |     return average_probability_yes_per_category, final_comment
937 | 
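# Illustrative recap of this section (hypothetical numbers): raw extracted
# numbers are rescaled to decimals, clamped to [0.01, 0.99], and renormalized
# so they sum to 1.
#
#   generate_multiple_choice_forecast(["A", "B", "C"], [20, 50, 30])
#   => {"A": 0.2, "B": 0.5, "C": 0.3}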
938 | 
939 | ################### FORECASTING ###################
940 | def forecast_is_already_made(post_details: dict) -> bool:
941 |     """
942 |     Check if a forecast has already been made by looking at my_forecasts in the question data.
943 | 
944 |     question.my_forecasts.latest.forecast_values has the following values for each question type:
945 |     Binary: [probability for no, probability for yes]
946 |     Numeric: [cdf value 1, cdf value 2, ..., cdf value 201]
947 |     Multiple Choice: [probability for option 1, probability for option 2, ...]
948 |     """
949 |     try:
950 |         forecast_values = post_details["question"]["my_forecasts"]["latest"][
951 |             "forecast_values"
952 |         ]
953 |         return forecast_values is not None
954 |     except Exception:
955 |         return False
956 | 
957 | 
958 | async def forecast_individual_question(
959 |     question_id: int,
960 |     post_id: int,
961 |     submit_prediction: bool,
962 |     num_runs_per_question: int,
963 |     skip_previously_forecasted_questions: bool,
964 | ) -> str:
965 |     post_details = get_post_details(post_id)
966 |     question_details = post_details["question"]
967 |     title = question_details["title"]
968 |     question_type = question_details["type"]
969 | 
970 |     summary_of_forecast = ""
971 |     summary_of_forecast += f"-----------------------------------------------\nQuestion: {title}\n"
972 |     summary_of_forecast += f"URL: https://www.metaculus.com/questions/{post_id}/\n"
973 | 
974 |     if question_type == "multiple_choice":
975 |         options = question_details["options"]
976 |         summary_of_forecast += f"options: {options}\n"
977 | 
978 |     if (
979 |         forecast_is_already_made(post_details)
980 |         and skip_previously_forecasted_questions
981 |     ):
982 |         summary_of_forecast += "Skipped: Forecast already made\n"
983 |         return summary_of_forecast
984 | 
985 |     if question_type == "binary":
986 |         forecast, comment = await get_binary_gpt_prediction(
987 |             question_details, num_runs_per_question
988 |         )
989 |     elif question_type == "numeric":
990 |         forecast, comment = await get_numeric_gpt_prediction(
991 |             question_details, num_runs_per_question
992 |         )
993 |     elif question_type == "multiple_choice":
994 |         forecast, comment = await get_multiple_choice_gpt_prediction(
995 |             question_details, num_runs_per_question
996 |         )
997 |     else:
998 |         raise ValueError(f"Unknown question type: {question_type}")
999 | 
1000 |     print(f"-----------------------------------------------\nPost {post_id} Question {question_id}:\n")
1001 |     print(f"Forecast for post {post_id} (question {question_id}):\n{forecast}")
1002 |     print(f"Comment for post {post_id} (question {question_id}):\n{comment}")
1003 | 
1004 |     if question_type == "numeric":
1005 |         summary_of_forecast += f"Forecast: {str(forecast)[:200]}...\n"
1006 |     else:
1007 |         summary_of_forecast += f"Forecast: {forecast}\n"
1008 | 
1009 |     summary_of_forecast += f"Comment:\n```\n{comment[:200]}...\n```\n\n"
1010 | 
1011 |     if submit_prediction:
1012 |         forecast_payload = create_forecast_payload(forecast, question_type)
1013 |         post_question_prediction(question_id, forecast_payload)
1014 |         post_question_comment(post_id, comment)
1015 |         summary_of_forecast += "Posted: Forecast was posted to Metaculus.\n"
1016 | 
1017 |     return summary_of_forecast
1018 | 
1019 | 
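# A dry-run sketch (illustrative): forecast one example question end-to-end
# without submitting anything to Metaculus.
#
#   summary = asyncio.run(
#       forecast_individual_question(
#           question_id=578,
#           post_id=578,
#           submit_prediction=False,
#           num_runs_per_question=1,
#           skip_previously_forecasted_questions=False,
#       )
#   )
#   print(summary)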
1020 | async def forecast_questions( 1021 | open_question_id_post_id: list[tuple[int, int]], 1022 | submit_prediction: bool, 1023 | num_runs_per_question: int, 1024 | skip_previously_forecasted_questions: bool, 1025 | ) -> None: 1026 | forecast_tasks = [ 1027 | forecast_individual_question( 1028 | question_id, 1029 | post_id, 1030 | submit_prediction, 1031 | num_runs_per_question, 1032 | skip_previously_forecasted_questions, 1033 | ) 1034 | for question_id, post_id in open_question_id_post_id 1035 | ] 1036 | forecast_summaries = await asyncio.gather(*forecast_tasks, return_exceptions=True) 1037 | print("\n", "#" * 100, "\nForecast Summaries\n", "#" * 100) 1038 | 1039 | errors = [] 1040 | for question_id_post_id, forecast_summary in zip( 1041 | open_question_id_post_id, forecast_summaries 1042 | ): 1043 | question_id, post_id = question_id_post_id 1044 | if isinstance(forecast_summary, Exception): 1045 | print( 1046 | f"-----------------------------------------------\nPost {post_id} Question {question_id}:\nError: {forecast_summary.__class__.__name__} {forecast_summary}\nURL: https://www.metaculus.com/questions/{post_id}/\n" 1047 | ) 1048 | errors.append(forecast_summary) 1049 | else: 1050 | print(forecast_summary) 1051 | 1052 | if errors: 1053 | print("-----------------------------------------------\nErrors:\n") 1054 | error_message = f"Errors were encountered: {errors}" 1055 | print(error_message) 1056 | raise RuntimeError(error_message) 1057 | 1058 | 1059 | 1060 | 1061 | ######################## FINAL RUN ######################### 1062 | if __name__ == "__main__": 1063 | if USE_EXAMPLE_QUESTIONS: 1064 | open_question_id_post_id = EXAMPLE_QUESTIONS 1065 | else: 1066 | open_question_id_post_id = get_open_question_ids_from_tournament() 1067 | 1068 | asyncio.run( 1069 | forecast_questions( 1070 | open_question_id_post_id, 1071 | SUBMIT_PREDICTION, 1072 | NUM_RUNS_PER_QUESTION, 1073 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS, 1074 | ) 1075 | ) 1076 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "metac-bot-template" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Vasile Popescu "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | python-decouple = "^3.8" 12 | requests = "^2.32.3" 13 | asknews = "^0.9.1" 14 | numpy = "^1.26.0" 15 | openai = "^1.57.4" 16 | python-dotenv = "^1.0.1" 17 | forecasting-tools = "^0.2.23" 18 | 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | ipykernel = "^6.29.5" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | 27 | --------------------------------------------------------------------------------