├── .DS_Store
├── .env.template
├── .github
│   └── workflows
│       ├── run_bot_on_quarterly_cup.yaml
│       ├── run_bot_on_tournament.yaml
│       └── test_bot.yaml
├── .gitignore
├── README.md
├── community_benchmark.py
├── main.py
├── main_with_no_framework.py
├── poetry.lock
└── pyproject.toml
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Metaculus/metac-bot-template/593f29f8cf65cbfda103c5791b4735a2dd35d9b6/.DS_Store
--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | # Required
2 | METACULUS_TOKEN=1234567890
3 |
4 | # Optional
5 | PERPLEXITY_API_KEY=1234567890
6 | OPENAI_API_KEY=1234567890
7 | EXA_API_KEY=1234567890
8 | ASKNEWS_CLIENT_ID=1234567890
9 | ASKNEWS_SECRET=1234567890
10 | ANTHROPIC_API_KEY=1234567890
--------------------------------------------------------------------------------
/.github/workflows/run_bot_on_quarterly_cup.yaml:
--------------------------------------------------------------------------------
1 | name: Forecast on Quarterly Cup
2 |
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | - cron: "0 0 */2 * *" # runs at midnight every 2 days
7 |
8 | # Add concurrency group to prevent parallel runs
9 | concurrency:
10 | group: ${{ github.workflow }}
11 | cancel-in-progress: false
12 |
13 |
14 | # Scheduled job to run the simple forecast bot on the Quarterly Cup
15 | jobs:
16 | daily_build:
17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is
18 | steps: # sets up the steps that will be run in order
19 | # setup repository with all necessary dependencies - keep as is
20 | - name: Check out repository
21 | uses: actions/checkout@v3
22 | - uses: actions/setup-python@v4
23 | with:
24 | python-version: "3.11"
25 | - name: Install poetry
26 | uses: snok/install-poetry@v1
27 | with:
28 | virtualenvs-create: true
29 | virtualenvs-in-project: true
30 | installer-parallel: true
31 | - name: Load cached venv
32 | id: cached-poetry-dependencies
33 | uses: actions/cache@v4
34 | with:
35 | path: .venv
36 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
37 | - name: Install dependencies
38 | run: poetry install --no-interaction --no-root
39 | - name: Run bot
40 | run: |
41 | poetry run python main.py --mode quarterly_cup
42 | # this reads the environment variables from the github repository.
43 | # Store under Settings --> Secrets and variables --> Actions
44 | env:
45 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token
46 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
47 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
48 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
49 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
50 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
51 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }}
52 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }}
--------------------------------------------------------------------------------
/.github/workflows/run_bot_on_tournament.yaml:
--------------------------------------------------------------------------------
1 | name: Forecast on new AI tournament questions
2 |
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | - cron: "*/30 * * * *" # runs every 30 minutes. Make sure to skip already forecasted questions!
7 |
8 | # Add concurrency group to prevent parallel runs
9 | concurrency:
10 | group: ${{ github.workflow }}
11 | cancel-in-progress: false
12 |
13 |
14 | # Scheduled job to run the simple forecast bot on new tournament questions
15 | jobs:
16 | daily_build:
17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is
18 | steps: # sets up the steps that will be run in order
19 | # setup repository with all necessary dependencies - keep as is
20 | - name: Check out repository
21 | uses: actions/checkout@v3
22 | - uses: actions/setup-python@v4
23 | with:
24 | python-version: "3.11"
25 | - name: Install poetry
26 | uses: snok/install-poetry@v1
27 | with:
28 | virtualenvs-create: true
29 | virtualenvs-in-project: true
30 | installer-parallel: true
31 | - name: Load cached venv
32 | id: cached-poetry-dependencies
33 | uses: actions/cache@v4
34 | with:
35 | path: .venv
36 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
37 | - name: Install dependencies
38 | run: poetry install --no-interaction --no-root
39 | - name: Run bot
40 | run: |
41 | poetry run python main.py
42 | # this reads the environment variables from the github repository.
43 | # Store under Settings --> Secrets and variables --> Actions
44 | env:
45 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token
46 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
47 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
48 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
49 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
50 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
51 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }}
52 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }}
--------------------------------------------------------------------------------
/.github/workflows/test_bot.yaml:
--------------------------------------------------------------------------------
1 | name: Test Bot
2 |
3 | on:
4 | workflow_dispatch:
5 |
6 | # Add concurrency group to prevent parallel runs
7 | concurrency:
8 | group: ${{ github.workflow }}
9 | cancel-in-progress: false
10 |
11 |
12 | # Manually triggered job to run the simple forecast bot on test questions
13 | jobs:
14 | daily_build:
15 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is
16 | steps: # sets up the steps that will be run in order
17 | # setup repository with all necessary dependencies - keep as is
18 | - name: Check out repository
19 | uses: actions/checkout@v3
20 | - uses: actions/setup-python@v4
21 | with:
22 | python-version: "3.11"
23 | - name: Install poetry
24 | uses: snok/install-poetry@v1
25 | with:
26 | virtualenvs-create: true
27 | virtualenvs-in-project: true
28 | installer-parallel: true
29 | - name: Load cached venv
30 | id: cached-poetry-dependencies
31 | uses: actions/cache@v4
32 | with:
33 | path: .venv
34 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
35 | - name: Install dependencies
36 | run: poetry install --no-interaction --no-root
37 | - name: Run bot
38 | run: |
39 | poetry run python main.py --mode test_questions
40 | # this reads the environment variables from the github repository.
41 | # Store under Settings --> Secrets and variables --> Actions
42 | env:
43 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token
44 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
45 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
46 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
47 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
48 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
49 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }}
50 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Specific to Project ###
2 | benchmarks/
3 | sandbox.py
4 |
5 |
6 | ### General gitignore ###
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 | cover/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | # *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/_build/
80 |
81 | # PyBuilder
82 | .pybuilder/
83 | target/
84 |
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 |
88 | # IPython
89 | profile_default/
90 | ipython_config.py
91 |
92 | # pyenv
93 | # For a library or package, you might want to ignore these files since the code is
94 | # intended to run in multiple environments; otherwise, check them in:
95 | # .python-version
96 |
97 | # pipenv
98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | # install all needed dependencies.
102 | #Pipfile.lock
103 |
104 | # poetry
105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106 | # This is especially recommended for binary packages to ensure reproducibility, and is more
107 | # commonly ignored for libraries.
108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109 | #poetry.lock
110 |
111 | # pdm
112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113 | #pdm.lock
114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
115 | # in version control.
116 | # https://pdm.fming.dev/#use-with-ide
117 | .pdm.toml
118 |
119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
120 | __pypackages__/
121 |
122 | # Celery stuff
123 | celerybeat-schedule
124 | celerybeat.pid
125 |
126 | # SageMath parsed files
127 | *.sage.py
128 |
129 | # Environments
130 | .env
131 | .venv
132 | env/
133 | venv/
134 | ENV/
135 | env.bak/
136 | venv.bak/
137 |
138 | # Spyder project settings
139 | .spyderproject
140 | .spyproject
141 |
142 | # Rope project settings
143 | .ropeproject
144 |
145 | # mkdocs documentation
146 | /site
147 |
148 | # mypy
149 | .mypy_cache/
150 | .dmypy.json
151 | dmypy.json
152 |
153 | # Pyre type checker
154 | .pyre/
155 |
156 | # pytype static type analyzer
157 | .pytype/
158 |
159 | # Cython debug symbols
160 | cython_debug/
161 |
162 | # Ruff
163 | .ruff_cache/
164 |
165 | # PyCharm
166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168 | # and can be added to the global gitignore or merged into this file. For a more nuclear
169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170 | # .idea/
171 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Metaculus forecasting bot
2 | This repository contains a simple bot meant to get you started with creating your own bot for the AI Forecasting Tournament. Go to https://www.metaculus.com/aib/ for more info and tournament rules.
3 |
4 | This project contains two main files:
5 | - **main.py**: Our recommended template. It uses the [forecasting-tools](https://github.com/Metaculus/forecasting-tools) package to handle much of the work in the background for you (such as API calls). We will keep updating the package, allowing you to gain new features with minimal changes to your code.
6 | - **main_with_no_framework.py**: A copy of main.py but implemented with minimal dependencies. Useful if you want a more custom approach.
7 |
8 | Join the conversation about bot creation, get support, and follow updates on the [Metaculus Discord](https://discord.com/invite/NJgCC2nDfh) 'build a forecasting bot' channel.
9 |
10 | ## 30min Video Tutorial
11 | This tutorial shows you how to set up our template bot so you can start forecasting in the tournament.
12 |
13 | [Watch the video tutorial on Loom](https://www.loom.com/share/fc3c1a643b984a15b510647d8f760685?sid=29b502e0-cf64-421e-82c0-3a78451159ed)
14 |
15 | If you run into trouble, reach out to `ben [at] metaculus [.com]`
16 |
17 |
18 | ## Quick start -> Fork and use Github Actions
19 | The easiest way to use this repo is to fork it, enable GitHub Actions, and then set repository secrets. Your bot will then run every 30 minutes, pick up new questions, and forecast on them. Automation is handled in the `.github/workflows/` folder. The `run_bot_on_tournament.yaml` workflow runs the bot every 30 minutes and skips questions it has already forecast on.
20 |
21 | 1) **Fork the repository**: Go to the [repository](https://github.com/Metaculus/metac-bot-template) and click 'fork'.
22 | 2) **Set secrets**: Go to `Settings -> Secrets and variables -> Actions -> New repository secret` and set your API keys/tokens as secrets. You will want to set your METACULUS_TOKEN. This will be used to post forecasts to Metaculus, and to use our OpenAI/Anthropic LLM proxy (reach out to `ben [at] metaculus [.com]` with your bot description to apply for credits. See the relevant section below).
23 | 3) **Enable Actions**: Go to 'Actions' then click 'Enable'. Then go to the 'Forecast on new AI tournament questions' workflow, and click 'Enable'. To test if the workflow is working, click 'Run workflow', choose the main branch, then click the green 'Run workflow' button. This will check for new questions and forecast only on ones it has not yet successfully forecast on.
24 |
25 | The bot should just work as is at this point. You can disable the workflow by clicking `Actions > Forecast on new AI tournament questions > Triple dots > Disable workflow`
26 |
27 | ## Getting your Metaculus Token
28 | To get a bot account and your API Token:
29 | 1) Go to https://metaculus.com/aib
30 | 2) Click "Log Out" if you are using your personal account
31 | 3) Click "Create a Bot Account"
32 | 4) Create your account
33 | 5) Go back to https://metaculus.com/aib
34 | 6) Click 'Show My Token'
35 |
36 | If your regular Metaculus account uses Gmail, you can create a separate bot account while keeping your existing email by adding a '+bot' before the @ symbol. For example, if your email is 'youremail@gmail.com', you can use 'youremail+bot1@gmail.com' for your bot account.
37 |
38 | ## Search Provider API Keys
39 |
40 | ### Getting AskNews Setup
41 | Metaculus is collaborating with AskNews to give free access to news searches. Each registered bot builder gets 3k calls per month and 9k calls total for the tournament, plus 5M tokens (note that latest-news requests, which look back 48 hours, cost 1 call, while archive news requests cost 5 calls). Bots have access to the /news and /deepnews endpoints. To sign up:
42 | 1. Make an account on AskNews (if you have not yet, https://my.asknews.app)
43 | 2. Join the [AskNews discord](https://discord.gg/99qt5HGgUn), send your bot name + AskNews registered email to the #api-support channel.
44 | 3. AskNews will make sure you have free calls and that your account is ready for you to create API keys
45 | 4. Generate your `ASKNEWS_CLIENT_ID` and `ASKNEWS_SECRET` [here](https://my.asknews.app/en/settings/api-credentials) and add them to your .env
46 | 5. Run the AskNewsSearcher from the forecasting-tools repo or use the AskNews SDK python package
47 |
48 | Your account will be active for the duration of the tournament. There is only one account allowed per participant.
49 |
50 | Example usage of /news and /deepnews:
51 |
52 | ```python
53 | from asknews_sdk import AsyncAskNewsSDK
54 | import asyncio
55 |
56 | """
57 | More information available here:
58 | https://docs.asknews.app/en/news
59 | https://docs.asknews.app/en/deepnews
60 |
61 | Installation:
62 | pip install asknews
63 | """
64 |
65 | client_id = ""
66 | client_secret = ""
67 |
68 | ask = AsyncAskNewsSDK(
69 | client_id=client_id,
70 | client_secret=client_secret,
71 | scopes=["chat", "news", "stories", "analytics"],
72 | )
73 |
74 | # /news endpoint example
75 | async def search_news(query):
76 |
77 | hot_response = await ask.news.search_news(
78 | query=query, # your natural language query
79 | n_articles=5, # control the number of articles to include in the context
80 | return_type="both",
81 | strategy="latest news" # enforces looking at the latest news only
82 | )
83 |
84 | print(hot_response.as_string)
85 |
86 | # get context from the "historical" database that contains a news archive going back to 2023
87 | historical_response = await ask.news.search_news(
88 | query=query,
89 | n_articles=10,
90 | return_type="both",
91 | strategy="news knowledge" # looks for relevant news within the past 60 days
92 | )
93 |
94 | print(historical_response.as_string)
95 |
96 | # /deepnews endpoint example:
97 | async def deep_research(
98 | query, sources, model, search_depth=2, max_depth=2
99 | ):
100 |
101 | response = await ask.chat.get_deep_news(
102 | messages=[{"role": "user", "content": query}],
103 | search_depth=search_depth,
104 | max_depth=max_depth,
105 | sources=sources,
106 | stream=False,
107 | return_sources=False,
108 | model=model,
109 | inline_citations="numbered"
110 | )
111 |
112 | print(response)
113 |
114 |
115 | if __name__ == "__main__":
116 | query = "What is the TAM of the global market for electric vehicles in 2025? With your final report, please report the TAM in USD using the tags ... "
117 |
118 | sources = ["asknews"]
119 | model = "deepseek-basic"
120 | search_depth = 2
121 | max_depth = 2
122 | asyncio.run(
123 | deep_research(
124 | query, sources, model, search_depth, max_depth
125 | )
126 | )
127 |
128 | asyncio.run(search_news(query))
129 | ```
130 |
131 | Some tips for DeepNews:
132 |
133 | You will get tags in your response marking sections such as the model's thinking, its searches, and the final answer.
134 |
135 | These tags are useful for extracting the pieces that you need for your pipeline. For example, if you don't want to include all the thinking/searching, you could extract just the final-answer section.
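
As a rough illustration, here is a small helper for pulling one tagged section out of a DeepNews response (the tag name `final_answer` is a placeholder; use whichever tag names actually appear in your output):

```python
import re

def extract_tag(response: str, tag: str) -> str | None:
    """Return the text inside <tag>...</tag>, or None if the tag is absent."""
    match = re.search(rf"<{tag}>(.*?)</{tag}>", response, flags=re.DOTALL)
    return match.group(1).strip() if match else None

# e.g. keep only the final answer and drop the thinking/search sections
# final_report = extract_tag(deepnews_response, "final_answer")  # deepnews_response is your /deepnews output
```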
140 |
141 | ### Getting Perplexity Set Up
142 | Perplexity works as an internet-powered LLM and costs half a cent per search (if you pick the right model) plus token costs. It is less customizable but generally cheaper.
143 | 1. Create an account on the free tier at www.perplexity.ai
144 | 2. Go to https://www.perplexity.ai/settings/account
145 | 3. Click "API" in the top bar
146 | 4. Click "Generate" in the "API Keys" section
147 | 5. Add funds to your account with the 'Buy Credits' button
148 | 6. Add it to the .env as `PERPLEXITY_API_KEY=your-key-here`
149 |
150 | ### Getting Exa Set Up
151 | Exa is closer to a more traditional search provider. Exa takes in a search query and a list of filters and returns a list of websites. Each site returned can have scraped text, semantic highlights, an AI summary, and more. By putting GPT on top of Exa, you can recreate Perplexity with more control; an implementation of this is available in the `SmartSearcher` of the `forecasting-tools` python package (a minimal usage sketch follows the setup steps below). Each Exa search costs half a cent per search plus a tenth of a cent per 'text-content' requested per site requested. Content items include: highlights from a source, a summary of a source, or the full text.
152 | 1. Make an account with Exa at Exa.ai
153 | 2. Go to https://dashboard.exa.ai/playground
154 | 3. Click on "API Keys" in the left sidebar
155 | 4. Create a new key
156 | 5. Go to 'Billing' in the left sidebar and add funds to your account with the 'Top Up Balance' button
157 | 6. Add it to the .env as `EXA_API_KEY=your-key-here`
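
A minimal sketch of using `SmartSearcher` for question research, mirroring how `main.py` calls it (the model name is only an example, and you will need `EXA_API_KEY` plus an LLM key such as `OPENAI_API_KEY` set):

```python
import asyncio
from forecasting_tools import SmartSearcher

async def research(question: str) -> str:
    searcher = SmartSearcher(
        model="gpt-4o-mini",  # example model; anything GeneralLlm/litellm supports should work
        temperature=0,
        num_searches_to_run=2,
        num_sites_per_search=10,
    )
    # Ask for a research rundown rather than a forecast
    return await searcher.invoke(f"Summarize the most relevant recent news for: {question}")

# print(asyncio.run(research("Will ... happen by 2026?")))
```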
158 |
159 | ### Other Search
160 | Here are some other unvetted but interesting options for search and website reading:
161 | - Tavily
162 | - Google Search API
163 | - crawl4ai
164 | - Firecrawl
165 | - Playwright
166 |
167 | ## Accessing the Metaculus LLM Proxy
168 | OpenAI and Anthropic have generously donated credits to bot builders in the tournament, which we are providing through an LLM proxy.
169 |
170 | To get credits assigned to your model choices (or if you need renewed credits from a previous quarter), please send an email to `ben [at] metaculus [.com]` with the below:
171 | * The username of your bot
172 | * A couple paragraph description of how your existing bot works, or what you plan to build
173 | * An estimate of how much budget/tokens you might productively use
174 | * Your preferred Anthropic/OpenAI model(s) and how you want the budget distributed between them (budget is allocated to each individual model name rather than to your account as a whole)
175 |
176 | Metaculus will add new OpenAI and Anthropic completion models to the proxy as they come out. If you want to use a new model, please send us an email with the model you desire, and how much budget you want moved from one model to another. Alternatively, if you have a new idea that needs more support, pitch it to us and we can give additional credits. Reach out if you run out.
177 |
178 | Visit [this page](https://www.notion.so/metaculus/OpenAI-and-Anthropic-credits-0e1f7bf8c8a248e4a38da8758cc04de4) for instructions on how to call the Metaculus proxy directly.
179 |
180 | You can also use the `forecasting-tools` package to call the proxy. To do this, call `await GeneralLlm(model="metaculus/{openai_or_anthropic_model_name}").invoke(prompt)` (with `GeneralLlm` imported from `forecasting_tools`). You will need METACULUS_TOKEN set in your .env file and credits already assigned to your account and model choice. GeneralLlm is a wrapper around the litellm package, which provides one API for every major model and provider, and can be used for other providers like Gemini, xAI, or OpenRouter. For more information about how to use GeneralLlm/litellm see [forecasting-tools](https://github.com/Metaculus/forecasting-tools) and [litellm](https://github.com/BerriAI/litellm).
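
A minimal sketch of calling the proxy this way (the model name below matches the commented example in `main.py`; use whichever proxied model has credits assigned to your bot):

```python
import asyncio
from forecasting_tools import GeneralLlm

async def ask_proxy(prompt: str) -> str:
    # Requires METACULUS_TOKEN in your environment / .env
    llm = GeneralLlm(
        model="metaculus/anthropic/claude-3-5-sonnet-20241022",  # example proxied model
        temperature=0.3,
    )
    return await llm.invoke(prompt)

# print(asyncio.run(ask_proxy("Summarize the status quo for this question: ...")))
```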
181 |
182 |
183 | ## Run the bot locally
184 | Clone the repository. Open a terminal and run the following command:
185 | ```bash
186 | git clone https://github.com/Metaculus/metac-bot-template.git
187 | ```
188 |
189 | If you forked the repository first, replace the URL in the `git clone` command with the URL of your fork (go to your forked repository and copy the URL from the browser address bar).
190 |
191 | ### Installing dependencies
192 | Make sure you have Python and [poetry](https://python-poetry.org/docs/#installing-with-pipx) installed (poetry is a Python package manager).
193 |
194 | If you don't have poetry installed, run the commands below (shown for Debian/Ubuntu):
195 | ```bash
196 | sudo apt update -y
197 | sudo apt install -y pipx
198 | pipx install poetry
199 |
200 | # Optional
201 | poetry config virtualenvs.in-project true
202 | ```
203 |
204 |
205 | Inside the terminal, go to the directory you cloned the repository into and run the following command:
206 | ```bash
207 | poetry install
208 | ```
209 | to install all required dependencies.
210 |
211 | ### Setting environment variables
212 |
213 | Running the bot requires various environment variables. If you run the bot locally, the easiest way to set them is to create a file called `.env` in the root directory of the repository (copy the `.env.template`).
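
A minimal sketch of loading that file locally with python-dotenv, mirroring what `main_with_no_framework.py` does at startup:

```python
import os
import dotenv

dotenv.load_dotenv()  # reads the .env file in the repository root
assert os.getenv("METACULUS_TOKEN"), "METACULUS_TOKEN is missing from your .env"
```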
214 |
215 | ### Running the bot
216 |
217 | To test the simple bot, execute the following command in your terminal:
218 | ```bash
219 | poetry run python main.py --mode test_questions
220 | ```
221 | Make sure to set the environment variables as described above and to set the parameters in the code to your liking. In particular, to submit predictions, make sure that `publish_reports_to_metaculus` is set to `True` in main.py (it is by default), or that `SUBMIT_PREDICTION = True` in main_with_no_framework.py.
222 |
223 | ## Early Benchmarking
224 | Provided in this project is an example of how to benchmark your bot's forecasts against the community prediction for questions on Metaculus. Running `community_benchmark.py` will run versions of your bot defined by you (e.g. with different LLMs or research paths) and score them on how close they are to the community prediction using expected baseline score (a proper score assuming the community prediction is the true probability). You will want to edit the file to choose which bot configurations you want to test and how many questions you want to test on. Any class inheriting from `forecasting_tools.ForecastBot` can be passed into the benchmarker. As of March 28, 2025 the benchmarker only works with binary questions.
225 |
226 | To run a benchmark:
227 | `poetry run python community_benchmark.py --mode run`
228 |
229 | To run a custom benchmark (e.g. remove background info from questions to test retrieval):
230 | `poetry run python community_benchmark.py --mode custom`
231 |
232 | To view a UI showing your scores, statistical error bars, and your bot's reasoning:
233 | `poetry run streamlit run community_benchmark.py`
234 |
235 | See more information in the benchmarking section of the [forecasting-tools repo](https://github.com/Metaculus/forecasting-tools?tab=readme-ov-file#benchmarking)
236 |
237 | ## Ideas for bot improvements
238 | Below are some ideas for making a novel bot.
239 | - Finetuned LLM on Metaculus Data: Create an optimized prompt (using DSPY or a similar toolset) and/or a fine-tuned LLM using all past Metaculus data. The thought is that this will train the LLM to be well-calibrated on real-life questions. Consider knowledge cutoffs and data leakage from search providers.
240 | - Dataset explorer: Create a tool that can find if there are datasets or graphs related to a question online, download them if they exist, and then run data science on them to answer a question.
241 | - Question decomposer: A tool that takes a complex question and breaks it down into simpler questions to answer those instead
242 | - Meta-Forecast Researcher: A tool that searches all major prediction markets, prediction aggregators, and possibly thought leaders to find relevant forecasts, and then combines them into an assessment for the current question (see [Metaforecast](https://metaforecast.org/)).
243 | - Base rate researcher: Create a tool to find accurate base rates. There is an experimental version [here](https://forecasting-tools.streamlit.app/base-rate-generator) in [forecasting-tools](https://github.com/Metaculus/forecasting-tools) that works 50% of the time.
244 | - Key factors researcher: Improve our experimental [key factors researcher](https://forecasting-tools.streamlit.app/key-factors) to find higher significance key factors for a given question.
245 | - Monte Carlo Simulations: Experiment with combining some tools to run effective Monte Carlo simulations. This could include experimenting with combining Squiggle with the question decomposer.
246 | - Adding personality diversity, LLM diversity, and other variations: Have GPT come up with a number of different ‘expert personalities’ or 'world-models' that it runs the forecasting bot with and then aggregates the median. Additionally, run the bot on different LLMs and see if the median of different LLMs improves the forecast. Finally, try simulating up to hundreds of personalities/LLM combinations to create large diverse crowds. Each individual could have a backstory, thinking process, biases they are resistant to, etc. This will ideally improve accuracy and give more useful bot reasoning outputs to help humans reading the output consider things from multiple angles.
247 | - Worldbuilding: Have GPT world-build different future scenarios and then forecast all the different parts of those scenarios. It then would choose the most likely future world. In addition to a forecast, descriptions of future ‘worlds’ are created. This can take inspiration from Feynman paths.
248 | - Consistency Forecasting: Forecast many tangential questions all at once (in a single prompt) and prompt for consistency rules between them.
249 | - Extremize & Calibrate Predictions: Using the historical performance of a bot, adjust forecasts to be better calibrated. For instance, if predictions of 30% from the bot actually happen 40% of the time, then transform predictions of 30% to 40% (see the sketch after this list).
250 | - Assigning points to evidence: Starting with some ideas from a [blog post from Ozzie Gooen](https://forum.effectivealtruism.org/posts/mrAZFnEjsQAQPJvLh/using-points-to-rate-different-kinds-of-evidence), you could experiment with assigning ‘points’ to major types of evidence and having GPT categorize the evidence it finds related to a forecast so that the ‘total points’ can be calculated. This can then be turned into a forecast, and potentially optimized using machine learning on past Metaculus data.
251 | - Search provider benchmark: Run bots using different combinations of search providers (e.g. Google, Bing, Exa.ai, Tavily, AskNews, Perplexity, etc) and search filters (e.g. only recent data, sites with a certain search rank, etc) and see if any specific one is better than others, or if using multiple of them makes a difference.
252 | - Timeline researcher: Make a tool that can take a niche topic and make a timeline for all major and minor events relevant to that topic.
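
As an illustrative sketch of the "Extremize & Calibrate Predictions" idea (the calibration pairs below are hypothetical; in practice you would fit them to your bot's historical resolution data):

```python
import numpy as np

# Hypothetical calibration data: what the bot predicted vs. how often those events actually happened
PREDICTED = np.array([0.10, 0.30, 0.50, 0.70, 0.90])
OBSERVED = np.array([0.15, 0.40, 0.55, 0.72, 0.88])

def recalibrate(p: float) -> float:
    """Map a raw prediction onto the bot's observed calibration curve via interpolation."""
    return float(np.interp(p, PREDICTED, OBSERVED))

# recalibrate(0.30) -> 0.40, matching the example above
```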
253 |
--------------------------------------------------------------------------------
/community_benchmark.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import argparse
4 | import asyncio
5 | import logging
6 | import sys
7 | from datetime import datetime, timedelta
8 | from typing import Literal
9 |
10 | import typeguard
11 | from forecasting_tools import (
12 | Benchmarker,
13 | ForecastBot,
14 | GeneralLlm,
15 | MonetaryCostManager,
16 | MetaculusApi,
17 | ApiFilter,
18 | run_benchmark_streamlit_page,
19 | )
20 |
21 | from main import TemplateForecaster
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 |
27 | async def benchmark_forecast_bot(mode: str) -> None:
28 | """
29 | Run a benchmark that compares your forecasts against the community prediction
30 | """
31 |
32 | number_of_questions = 30 # Recommend 100+ for meaningful error bars, but 30 is faster/cheaper
33 | if mode == "display":
34 | run_benchmark_streamlit_page()
35 | return
36 | elif mode == "run":
37 | questions = MetaculusApi.get_benchmark_questions(number_of_questions)
38 | elif mode == "custom":
39 | # Below is an example of getting custom questions
40 | one_year_from_now = datetime.now() + timedelta(days=365)
41 | api_filter = ApiFilter(
42 | allowed_statuses=["open"],
43 | allowed_types=["binary"],
44 | num_forecasters_gte=40,
45 | scheduled_resolve_time_lt=one_year_from_now,
46 | includes_bots_in_aggregates=False,
47 | community_prediction_exists=True,
48 | )
49 | questions = await MetaculusApi.get_questions_matching_filter(
50 | api_filter,
51 | num_questions=number_of_questions,
52 | randomly_sample=True,
53 | )
54 | for question in questions:
55 | question.background_info = None # Test ability to find new information
56 | else:
57 | raise ValueError(f"Invalid mode: {mode}")
58 |
59 | with MonetaryCostManager() as cost_manager:
60 | bots = [
61 | TemplateForecaster(
62 | predictions_per_research_report=5,
63 | llms={
64 | "default": GeneralLlm(
65 | model="gpt-4o-mini",
66 | temperature=0.3,
67 | ),
68 | },
69 | ),
70 | TemplateForecaster(
71 | predictions_per_research_report=1,
72 | llms={
73 | "default": GeneralLlm(
74 | model="gpt-4o-mini",
75 | temperature=0.3,
76 | ),
77 | },
78 | ),
79 | # Add other ForecastBots here (or same bot with different parameters)
80 | ]
81 | bots = typeguard.check_type(bots, list[ForecastBot])
82 | benchmarks = await Benchmarker(
83 | questions_to_use=questions,
84 | forecast_bots=bots,
85 | file_path_to_save_reports="benchmarks/",
86 | concurrent_question_batch_size=10,
87 | ).run_benchmark()
88 | for i, benchmark in enumerate(benchmarks):
89 | logger.info(
90 | f"Benchmark {i+1} of {len(benchmarks)}: {benchmark.name}"
91 | )
92 | logger.info(
93 | f"- Final Score: {benchmark.average_expected_baseline_score}"
94 | )
95 | logger.info(f"- Total Cost: {benchmark.total_cost}")
96 | logger.info(f"- Time taken: {benchmark.time_taken_in_minutes}")
97 | logger.info(f"Total Cost: {cost_manager.current_usage}")
98 |
99 |
100 | if __name__ == "__main__":
101 | logging.basicConfig(
102 | level=logging.INFO,
103 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
104 | handlers=[
105 | logging.StreamHandler(sys.stdout),
106 | logging.FileHandler(f"benchmarks/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log")
107 | ]
108 | )
109 |
110 | # Suppress LiteLLM logging
111 | litellm_logger = logging.getLogger("LiteLLM")
112 | litellm_logger.setLevel(logging.WARNING)
113 | litellm_logger.propagate = False
114 |
115 | # Parse command line arguments
116 | parser = argparse.ArgumentParser(
117 | description="Benchmark a list of bots"
118 | )
119 | parser.add_argument(
120 | "--mode",
121 | type=str,
122 | choices=["run", "custom", "display"],
123 | default="display",
124 | help="Specify the run mode (default: display)",
125 | )
126 | args = parser.parse_args()
127 | mode: Literal["run", "custom", "display"] = (
128 | args.mode
129 | )
130 | asyncio.run(benchmark_forecast_bot(mode))
131 |
132 |
133 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import asyncio
3 | import logging
4 | import os
5 | from datetime import datetime
6 | from typing import Literal
7 |
8 | from forecasting_tools import (
9 | AskNewsSearcher,
10 | BinaryQuestion,
11 | ForecastBot,
12 | GeneralLlm,
13 | MetaculusApi,
14 | MetaculusQuestion,
15 | MultipleChoiceQuestion,
16 | NumericDistribution,
17 | NumericQuestion,
18 | PredictedOptionList,
19 | PredictionExtractor,
20 | ReasonedPrediction,
21 | SmartSearcher,
22 | clean_indents,
23 | )
24 |
25 | logger = logging.getLogger(__name__)
26 |
27 |
28 | class TemplateForecaster(ForecastBot):
29 | """
30 | This is a copy of the template bot for Q2 2025 Metaculus AI Tournament.
31 | The official bots on the leaderboard use AskNews in Q2.
32 | Main template bot changes since Q1
33 | - Support for new units parameter was added
34 | - You now set your llms when you initialize the bot (making it easier to switch between and benchmark different models)
35 |
36 | The main entry point of this bot is `forecast_on_tournament` in the parent class.
37 | See the script at the bottom of the file for more details on how to run the bot.
38 | Ignoring the finer details, the general flow is:
39 | - Load questions from Metaculus
40 | - For each question
41 | - Execute run_research a number of times equal to research_reports_per_question
42 | - Execute respective run_forecast function `predictions_per_research_report * research_reports_per_question` times
43 | - Aggregate the predictions
44 | - Submit prediction (if publish_reports_to_metaculus is True)
45 | - Return a list of ForecastReport objects
46 |
47 | Only the research and forecast functions need to be implemented in ForecastBot subclasses.
48 |
49 |     If you end up having trouble with rate limits and want a more sophisticated rate limiter, try:
50 | ```
51 | from forecasting_tools.ai_models.resource_managers.refreshing_bucket_rate_limiter import RefreshingBucketRateLimiter
52 | rate_limiter = RefreshingBucketRateLimiter(
53 | capacity=2,
54 | refresh_rate=1,
55 | ) # Allows 1 request per second on average with a burst of 2 requests initially. Set this as a class variable
56 | await self.rate_limiter.wait_till_able_to_acquire_resources(1) # 1 because it's consuming 1 request (use more if you are adding a token limit)
57 | ```
58 |     Additionally, OpenRouter offers high rate limits immediately on account creation.
59 | """
60 |
61 | _max_concurrent_questions = 2 # Set this to whatever works for your search-provider/ai-model rate limits
62 | _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions)
63 |
64 | async def run_research(self, question: MetaculusQuestion) -> str:
65 | async with self._concurrency_limiter:
66 | research = ""
67 | if os.getenv("ASKNEWS_CLIENT_ID") and os.getenv("ASKNEWS_SECRET"):
68 | research = await AskNewsSearcher().get_formatted_news_async(
69 | question.question_text
70 | )
71 | elif os.getenv("EXA_API_KEY"):
72 | research = await self._call_exa_smart_searcher(
73 | question.question_text
74 | )
75 | elif os.getenv("PERPLEXITY_API_KEY"):
76 | research = await self._call_perplexity(question.question_text)
77 | elif os.getenv("OPENROUTER_API_KEY"):
78 | research = await self._call_perplexity(
79 | question.question_text, use_open_router=True
80 | )
81 | else:
82 | logger.warning(
83 | f"No research provider found when processing question URL {question.page_url}. Will pass back empty string."
84 | )
85 | research = ""
86 | logger.info(
87 | f"Found Research for URL {question.page_url}:\n{research}"
88 | )
89 | return research
90 |
91 | async def _call_perplexity(
92 | self, question: str, use_open_router: bool = False
93 | ) -> str:
94 | prompt = clean_indents(
95 | f"""
96 | You are an assistant to a superforecaster.
97 | The superforecaster will give you a question they intend to forecast on.
98 | To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
99 | You do not produce forecasts yourself.
100 |
101 | Question:
102 | {question}
103 | """
104 | ) # NOTE: The metac bot in Q1 put everything but the question in the system prompt.
105 | if use_open_router:
106 | model_name = "openrouter/perplexity/sonar-reasoning"
107 | else:
108 | model_name = "perplexity/sonar-pro" # perplexity/sonar-reasoning and perplexity/sonar are cheaper, but do only 1 search
109 | model = GeneralLlm(
110 | model=model_name,
111 | temperature=0.1,
112 | )
113 | response = await model.invoke(prompt)
114 | return response
115 |
116 | async def _call_exa_smart_searcher(self, question: str) -> str:
117 | """
118 |         SmartSearcher is a custom class that is a wrapper around a search on Exa.ai
119 | """
120 | searcher = SmartSearcher(
121 | model=self.get_llm("default", "llm"),
122 | temperature=0,
123 | num_searches_to_run=2,
124 | num_sites_per_search=10,
125 | )
126 | prompt = (
127 |             "You are an assistant to a superforecaster. The superforecaster will give "
128 |             "you a question they intend to forecast on. To be a great assistant, you generate "
129 |             "a concise but detailed rundown of the most relevant news, including if the question "
130 | "would resolve Yes or No based on current information. You do not produce forecasts yourself."
131 | f"\n\nThe question is: {question}"
132 | ) # You can ask the searcher to filter by date, exclude/include a domain, and run specific searches for finding sources vs finding highlights within a source
133 | response = await searcher.invoke(prompt)
134 | return response
135 |
136 | async def _run_forecast_on_binary(
137 | self, question: BinaryQuestion, research: str
138 | ) -> ReasonedPrediction[float]:
139 | prompt = clean_indents(
140 | f"""
141 | You are a professional forecaster interviewing for a job.
142 |
143 | Your interview question is:
144 | {question.question_text}
145 |
146 | Question background:
147 | {question.background_info}
148 |
149 |
150 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied:
151 | {question.resolution_criteria}
152 |
153 | {question.fine_print}
154 |
155 |
156 | Your research assistant says:
157 | {research}
158 |
159 | Today is {datetime.now().strftime("%Y-%m-%d")}.
160 |
161 | Before answering you write:
162 |             (a) The time left until the outcome of the question is known.
163 | (b) The status quo outcome if nothing changed.
164 | (c) A brief description of a scenario that results in a No outcome.
165 | (d) A brief description of a scenario that results in a Yes outcome.
166 |
167 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time.
168 |
169 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100
170 | """
171 | )
172 | reasoning = await self.get_llm("default", "llm").invoke(prompt)
173 | prediction: float = PredictionExtractor.extract_last_percentage_value(
174 | reasoning, max_prediction=1, min_prediction=0
175 | )
176 | logger.info(
177 | f"Forecasted URL {question.page_url} as {prediction} with reasoning:\n{reasoning}"
178 | )
179 | return ReasonedPrediction(
180 | prediction_value=prediction, reasoning=reasoning
181 | )
182 |
183 | async def _run_forecast_on_multiple_choice(
184 | self, question: MultipleChoiceQuestion, research: str
185 | ) -> ReasonedPrediction[PredictedOptionList]:
186 | prompt = clean_indents(
187 | f"""
188 | You are a professional forecaster interviewing for a job.
189 |
190 | Your interview question is:
191 | {question.question_text}
192 |
193 | The options are: {question.options}
194 |
195 |
196 | Background:
197 | {question.background_info}
198 |
199 | {question.resolution_criteria}
200 |
201 | {question.fine_print}
202 |
203 |
204 | Your research assistant says:
205 | {research}
206 |
207 | Today is {datetime.now().strftime("%Y-%m-%d")}.
208 |
209 | Before answering you write:
210 |             (a) The time left until the outcome of the question is known.
211 |             (b) The status quo outcome if nothing changed.
212 |             (c) A description of a scenario that results in an unexpected outcome.
213 |
214 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes.
215 |
216 | The last thing you write is your final probabilities for the N options in this order {question.options} as:
217 | Option_A: Probability_A
218 | Option_B: Probability_B
219 | ...
220 | Option_N: Probability_N
221 | """
222 | )
223 | reasoning = await self.get_llm("default", "llm").invoke(prompt)
224 | prediction: PredictedOptionList = (
225 | PredictionExtractor.extract_option_list_with_percentage_afterwards(
226 | reasoning, question.options
227 | )
228 | )
229 | logger.info(
230 | f"Forecasted URL {question.page_url} as {prediction} with reasoning:\n{reasoning}"
231 | )
232 | return ReasonedPrediction(
233 | prediction_value=prediction, reasoning=reasoning
234 | )
235 |
236 | async def _run_forecast_on_numeric(
237 | self, question: NumericQuestion, research: str
238 | ) -> ReasonedPrediction[NumericDistribution]:
239 | upper_bound_message, lower_bound_message = (
240 | self._create_upper_and_lower_bound_messages(question)
241 | )
242 | prompt = clean_indents(
243 | f"""
244 | You are a professional forecaster interviewing for a job.
245 |
246 | Your interview question is:
247 | {question.question_text}
248 |
249 | Background:
250 | {question.background_info}
251 |
252 | {question.resolution_criteria}
253 |
254 | {question.fine_print}
255 |
256 | Units for answer: {question.unit_of_measure if question.unit_of_measure else "Not stated (please infer this)"}
257 |
258 | Your research assistant says:
259 | {research}
260 |
261 | Today is {datetime.now().strftime("%Y-%m-%d")}.
262 |
263 | {lower_bound_message}
264 | {upper_bound_message}
265 |
266 | Formatting Instructions:
267 | - Please notice the units requested (e.g. whether you represent a number as 1,000,000 or 1 million).
268 | - Never use scientific notation.
269 | - Always start with a smaller number (more negative if negative) and then increase from there
270 |
271 | Before answering you write:
272 |             (a) The time left until the outcome of the question is known.
273 | (b) The outcome if nothing changed.
274 | (c) The outcome if the current trend continued.
275 | (d) The expectations of experts and markets.
276 | (e) A brief description of an unexpected scenario that results in a low outcome.
277 | (f) A brief description of an unexpected scenario that results in a high outcome.
278 |
279 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.
280 |
281 | The last thing you write is your final answer as:
282 | "
283 | Percentile 10: XX
284 | Percentile 20: XX
285 | Percentile 40: XX
286 | Percentile 60: XX
287 | Percentile 80: XX
288 | Percentile 90: XX
289 | "
290 | """
291 | )
292 | reasoning = await self.get_llm("default", "llm").invoke(prompt)
293 | prediction: NumericDistribution = (
294 | PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability(
295 | reasoning, question
296 | )
297 | )
298 | logger.info(
299 | f"Forecasted URL {question.page_url} as {prediction.declared_percentiles} with reasoning:\n{reasoning}"
300 | )
301 | return ReasonedPrediction(
302 | prediction_value=prediction, reasoning=reasoning
303 | )
304 |
305 | def _create_upper_and_lower_bound_messages(
306 | self, question: NumericQuestion
307 | ) -> tuple[str, str]:
308 | if question.open_upper_bound:
309 | upper_bound_message = ""
310 | else:
311 | upper_bound_message = (
312 | f"The outcome can not be higher than {question.upper_bound}."
313 | )
314 | if question.open_lower_bound:
315 | lower_bound_message = ""
316 | else:
317 | lower_bound_message = (
318 | f"The outcome can not be lower than {question.lower_bound}."
319 | )
320 | return upper_bound_message, lower_bound_message
321 |
322 |
323 | if __name__ == "__main__":
324 | logging.basicConfig(
325 | level=logging.INFO,
326 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
327 | )
328 |
329 | # Suppress LiteLLM logging
330 | litellm_logger = logging.getLogger("LiteLLM")
331 | litellm_logger.setLevel(logging.WARNING)
332 | litellm_logger.propagate = False
333 |
334 | parser = argparse.ArgumentParser(
335 |         description="Run the TemplateForecaster forecasting system"
336 | )
337 | parser.add_argument(
338 | "--mode",
339 | type=str,
340 | choices=["tournament", "quarterly_cup", "test_questions"],
341 | default="tournament",
342 | help="Specify the run mode (default: tournament)",
343 | )
344 | args = parser.parse_args()
345 | run_mode: Literal["tournament", "quarterly_cup", "test_questions"] = (
346 | args.mode
347 | )
348 | assert run_mode in [
349 | "tournament",
350 | "quarterly_cup",
351 | "test_questions",
352 | ], "Invalid run mode"
353 |
354 | template_bot = TemplateForecaster(
355 | research_reports_per_question=1,
356 | predictions_per_research_report=5,
357 | use_research_summary_to_forecast=False,
358 | publish_reports_to_metaculus=True,
359 | folder_to_save_reports_to=None,
360 | skip_previously_forecasted_questions=True,
361 | # llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you
362 | # "default": GeneralLlm(
363 | # model="metaculus/anthropic/claude-3-5-sonnet-20241022",
364 | # temperature=0.3,
365 | # timeout=40,
366 | # allowed_tries=2,
367 | # ),
368 | # "summarizer": "openai/gpt-4o-mini",
369 | # },
370 | )
371 |
372 | if run_mode == "tournament":
373 | forecast_reports = asyncio.run(
374 | template_bot.forecast_on_tournament(
375 | MetaculusApi.CURRENT_AI_COMPETITION_ID, return_exceptions=True
376 | )
377 | )
378 | elif run_mode == "quarterly_cup":
379 | # The quarterly cup is a good way to test the bot's performance on regularly open questions. You can also use AXC_2025_TOURNAMENT_ID = 32564
380 | # The new quarterly cup may not be initialized near the beginning of a quarter
381 | template_bot.skip_previously_forecasted_questions = False
382 | forecast_reports = asyncio.run(
383 | template_bot.forecast_on_tournament(
384 | MetaculusApi.CURRENT_QUARTERLY_CUP_ID, return_exceptions=True
385 | )
386 | )
387 | elif run_mode == "test_questions":
388 | # Example questions are a good way to test the bot's performance on a single question
389 | EXAMPLE_QUESTIONS = [
390 | "https://www.metaculus.com/questions/578/human-extinction-by-2100/", # Human Extinction - Binary
391 | "https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/", # Age of Oldest Human - Numeric
392 | "https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/", # Number of New Leading AI Labs - Multiple Choice
393 | ]
394 | template_bot.skip_previously_forecasted_questions = False
395 | questions = [
396 | MetaculusApi.get_question_by_url(question_url)
397 | for question_url in EXAMPLE_QUESTIONS
398 | ]
399 | forecast_reports = asyncio.run(
400 | template_bot.forecast_questions(questions, return_exceptions=True)
401 | )
402 | TemplateForecaster.log_report_summary(forecast_reports) # type: ignore
403 |
--------------------------------------------------------------------------------
/main_with_no_framework.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import datetime
3 | import json
4 | import os
5 | import re
6 | import dotenv
7 | dotenv.load_dotenv()
8 |
9 | from openai import AsyncOpenAI
10 | import numpy as np
11 | import requests
12 | import forecasting_tools
13 | from asknews_sdk import AskNewsSDK
14 |
15 |
16 | ######################### CONSTANTS #########################
17 | # Constants
18 | SUBMIT_PREDICTION = True # set to True to publish your predictions to Metaculus
19 | USE_EXAMPLE_QUESTIONS = False # set to True to forecast example questions rather than the tournament questions
20 | NUM_RUNS_PER_QUESTION = 5 # The median forecast is taken between NUM_RUNS_PER_QUESTION runs
21 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS = True
22 |
23 | # Environment variables
24 | # You only need *either* Exa or Perplexity or AskNews keys for online research
25 | METACULUS_TOKEN = os.getenv("METACULUS_TOKEN")
26 | PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
27 | ASKNEWS_CLIENT_ID = os.getenv("ASKNEWS_CLIENT_ID")
28 | ASKNEWS_SECRET = os.getenv("ASKNEWS_SECRET")
29 | EXA_API_KEY = os.getenv("EXA_API_KEY")
30 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # You'll also need the OpenAI API Key if you want to use the Exa Smart Searcher
31 |
32 | # The tournament IDs below can be used for testing your bot.
33 | Q4_2024_AI_BENCHMARKING_ID = 32506
34 | Q1_2025_AI_BENCHMARKING_ID = 32627
35 | Q4_2024_QUARTERLY_CUP_ID = 3672
36 | Q1_2025_QUARTERLY_CUP_ID = 32630
37 | AXC_2025_TOURNAMENT_ID = 32564
38 | GIVEWELL_ID = 3600
39 | RESPIRATORY_OUTLOOK_ID = 3411
40 |
41 | TOURNAMENT_ID = Q1_2025_AI_BENCHMARKING_ID
42 |
43 | # The example questions can be used for testing your bot. (note that question and post id are not always the same)
44 | EXAMPLE_QUESTIONS = [ # (question_id, post_id)
45 | (578, 578), # Human Extinction - Binary - https://www.metaculus.com/questions/578/human-extinction-by-2100/
46 | (14333, 14333), # Age of Oldest Human - Numeric - https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/
47 | (22427, 22427), # Number of New Leading AI Labs - Multiple Choice - https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/
48 | ]
49 |
50 | # Also, we realize the below code could probably be cleaned up a bit in a few places
51 | # Though we are assuming most people will dissect it enough to make this not matter much
52 |
53 | ######################### HELPER FUNCTIONS #########################
54 |
55 | # @title Helper functions
56 | AUTH_HEADERS = {"headers": {"Authorization": f"Token {METACULUS_TOKEN}"}}
57 | API_BASE_URL = "https://www.metaculus.com/api"
58 |
59 |
60 | def post_question_comment(post_id: int, comment_text: str) -> None:
61 | """
62 | Post a comment on the question page as the bot user.
63 | """
64 |
65 | response = requests.post(
66 | f"{API_BASE_URL}/comments/create/",
67 | json={
68 | "text": comment_text,
69 | "parent": None,
70 | "included_forecast": True,
71 | "is_private": True,
72 | "on_post": post_id,
73 | },
74 | **AUTH_HEADERS, # type: ignore
75 | )
76 | if not response.ok:
77 | raise RuntimeError(response.text)
78 |
79 |
80 | def post_question_prediction(question_id: int, forecast_payload: dict) -> None:
81 | """
82 | Post a forecast on a question.
83 | """
84 | url = f"{API_BASE_URL}/questions/forecast/"
85 | response = requests.post(
86 | url,
87 | json=[
88 | {
89 | "question": question_id,
90 | **forecast_payload,
91 | },
92 | ],
93 | **AUTH_HEADERS, # type: ignore
94 | )
95 | print(f"Prediction Post status code: {response.status_code}")
96 | if not response.ok:
97 | raise RuntimeError(response.text)
98 |
99 |
100 | def create_forecast_payload(
101 | forecast: float | dict[str, float] | list[float],
102 | question_type: str,
103 | ) -> dict:
104 | """
105 | Accepts a forecast and generates the api payload in the correct format.
106 |
107 | If the question is binary, forecast must be a float.
108 | If the question is multiple choice, forecast must be a dictionary that
109 | maps question.options labels to floats.
110 | If the question is numeric, forecast must be a dictionary that maps
111 | quartiles or percentiles to datetimes, or a 201 value cdf.
112 | """
113 | if question_type == "binary":
114 | return {
115 | "probability_yes": forecast,
116 | "probability_yes_per_category": None,
117 | "continuous_cdf": None,
118 | }
119 | if question_type == "multiple_choice":
120 | return {
121 | "probability_yes": None,
122 | "probability_yes_per_category": forecast,
123 | "continuous_cdf": None,
124 | }
125 | # numeric or date
126 | return {
127 | "probability_yes": None,
128 | "probability_yes_per_category": None,
129 | "continuous_cdf": forecast,
130 | }
131 |
132 |
133 | def list_posts_from_tournament(
134 | tournament_id: int = TOURNAMENT_ID, offset: int = 0, count: int = 50
135 | ) -> dict:
136 | """
137 | List (all details) {count} posts from the {tournament_id}
138 | """
139 | url_qparams = {
140 | "limit": count,
141 | "offset": offset,
142 | "order_by": "-hotness",
143 | "forecast_type": ",".join(
144 | [
145 | "binary",
146 | "multiple_choice",
147 | "numeric",
148 | ]
149 | ),
150 | "tournaments": [tournament_id],
151 | "statuses": "open",
152 | "include_description": "true",
153 | }
154 | url = f"{API_BASE_URL}/posts/"
155 | response = requests.get(url, **AUTH_HEADERS, params=url_qparams) # type: ignore
156 | if not response.ok:
157 | raise Exception(response.text)
158 | data = json.loads(response.content)
159 | return data
160 |
161 |
162 | def get_open_question_ids_from_tournament() -> list[tuple[int, int]]:
163 | posts = list_posts_from_tournament()
164 |
165 | post_dict = dict()
166 | for post in posts["results"]:
167 | if question := post.get("question"):
168 | # single question post
169 | post_dict[post["id"]] = [question]
170 |
171 | open_question_id_post_id = [] # [(question_id, post_id)]
172 | for post_id, questions in post_dict.items():
173 | for question in questions:
174 | if question.get("status") == "open":
175 | print(
176 | f"ID: {question['id']}\nQ: {question['title']}\nCloses: "
177 | f"{question['scheduled_close_time']}"
178 | )
179 | open_question_id_post_id.append((question["id"], post_id))
180 |
181 | return open_question_id_post_id
182 |
183 |
184 | def get_post_details(post_id: int) -> dict:
185 | """
186 | Get all details about a post from the Metaculus API.
187 | """
188 | url = f"{API_BASE_URL}/posts/{post_id}/"
189 | print(f"Getting details for {url}")
190 | response = requests.get(
191 | url,
192 | **AUTH_HEADERS, # type: ignore
193 | )
194 | if not response.ok:
195 | raise Exception(response.text)
196 | details = json.loads(response.content)
197 | return details
198 |
199 | CONCURRENT_REQUESTS_LIMIT = 5
200 | llm_rate_limiter = asyncio.Semaphore(CONCURRENT_REQUESTS_LIMIT)
201 |
202 |
203 | async def call_llm(prompt: str, model: str = "gpt-4o", temperature: float = 0.3) -> str:
204 | """
205 |     Makes a completion request to the OpenAI API (via the Metaculus proxy by default) with concurrent request limiting.
206 | """
207 |
208 | # Remove the base_url parameter to call the OpenAI API directly
209 | # Also checkout the package 'litellm' for one function that can call any model from any provider
210 | # Email ben@metaculus.com if you need credit for the Metaculus OpenAI/Anthropic proxy
211 | client = AsyncOpenAI(
212 | base_url="https://llm-proxy.metaculus.com/proxy/openai/v1",
213 | default_headers={
214 | "Content-Type": "application/json",
215 | "Authorization": f"Token {METACULUS_TOKEN}",
216 | },
217 |         api_key="Fake API key since the openai client requires this not to be None. This isn't used",
218 | max_retries=2,
219 | )
220 |
221 | async with llm_rate_limiter:
222 | response = await client.chat.completions.create(
223 | model=model,
224 | messages=[{"role": "user", "content": prompt}],
225 | temperature=temperature,
226 | stream=False,
227 | )
228 | answer = response.choices[0].message.content
229 | if answer is None:
230 | raise ValueError("No answer returned from LLM")
231 | return answer
232 |
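# Minimal usage sketch (illustrative only, not executed): many prompts can be dispatched at
# once and the semaphore above keeps at most CONCURRENT_REQUESTS_LIMIT requests in flight.
#
#   async def _demo_call_llm() -> list[str]:
#       prompts = ["Summarize the question.", "List key considerations."]  # made-up prompts
#       return list(await asyncio.gather(*[call_llm(p) for p in prompts]))
#
#   # answers = asyncio.run(_demo_call_llm())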
233 |
234 | def run_research(question: str) -> str:
235 | research = ""
236 | if ASKNEWS_CLIENT_ID and ASKNEWS_SECRET:
237 | research = call_asknews(question)
238 | elif EXA_API_KEY:
239 | research = call_exa_smart_searcher(question)
240 | elif PERPLEXITY_API_KEY:
241 | research = call_perplexity(question)
242 | else:
243 | research = "No research done"
244 |
245 | print(f"########################\nResearch Found:\n{research}\n########################")
246 |
247 | return research
248 |
249 | def call_perplexity(question: str) -> str:
250 | url = "https://api.perplexity.ai/chat/completions"
251 | api_key = PERPLEXITY_API_KEY
252 | headers = {
253 | "accept": "application/json",
254 | "authorization": f"Bearer {api_key}",
255 | "content-type": "application/json",
256 | }
257 | payload = {
258 | "model": "llama-3.1-sonar-huge-128k-online",
259 | "messages": [
260 | {
261 | "role": "system", # this is a system prompt designed to guide the perplexity assistant
262 | "content": """
263 | You are an assistant to a superforecaster.
264 | The superforecaster will give you a question they intend to forecast on.
265 | To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
266 | You do not produce forecasts yourself.
267 | """,
268 | },
269 | {
270 | "role": "user", # this is the actual prompt we ask the perplexity assistant to answer
271 | "content": question,
272 | },
273 | ],
274 | }
275 | response = requests.post(url=url, json=payload, headers=headers)
276 | if not response.ok:
277 | raise Exception(response.text)
278 | content = response.json()["choices"][0]["message"]["content"]
279 | return content
280 |
281 | def call_exa_smart_searcher(question: str) -> str:
282 | if OPENAI_API_KEY is None:
283 | searcher = forecasting_tools.ExaSearcher(
284 | include_highlights=True,
285 | num_results=10,
286 | )
287 | highlights = asyncio.run(searcher.invoke_for_highlights_in_relevance_order(question))
288 | prioritized_highlights = highlights[:10]
289 | combined_highlights = ""
290 | for i, highlight in enumerate(prioritized_highlights):
291 | combined_highlights += f'[Highlight {i+1}]:\nTitle: {highlight.source.title}\nURL: {highlight.source.url}\nText: "{highlight.highlight_text}"\n\n'
292 | response = combined_highlights
293 | else:
294 | searcher = forecasting_tools.SmartSearcher(
295 | temperature=0,
296 | num_searches_to_run=2,
297 | num_sites_per_search=10,
298 | )
299 | prompt = (
300 |             "You are an assistant to a superforecaster. The superforecaster will give "
301 |             "you a question they intend to forecast on. To be a great assistant, you generate "
302 |             "a concise but detailed rundown of the most relevant news, including if the question "
303 |             "would resolve Yes or No based on current information. You do not produce forecasts yourself."
304 | f"\n\nThe question is: {question}"
305 | )
306 | response = asyncio.run(searcher.invoke(prompt))
307 |
308 | return response
309 |
310 | def call_asknews(question: str) -> str:
311 | """
312 | Use the AskNews `news` endpoint to get news context for your query.
313 | The full API reference can be found here: https://docs.asknews.app/en/reference#get-/v1/news/search
314 | """
315 | ask = AskNewsSDK(
316 | client_id=ASKNEWS_CLIENT_ID, client_secret=ASKNEWS_SECRET, scopes=set(["news"])
317 | )
318 |
319 | # get the latest news related to the query (within the past 48 hours)
320 | hot_response = ask.news.search_news(
321 | query=question, # your natural language query
322 | n_articles=6, # control the number of articles to include in the context, originally 5
323 | return_type="both",
324 | strategy="latest news", # enforces looking at the latest news only
325 | )
326 |
327 | # get context from the "historical" database that contains a news archive going back to 2023
328 | historical_response = ask.news.search_news(
329 | query=question,
330 | n_articles=10,
331 | return_type="both",
332 | strategy="news knowledge", # looks for relevant news within the past 60 days
333 | )
334 |
335 | hot_articles = hot_response.as_dicts
336 | historical_articles = historical_response.as_dicts
337 | formatted_articles = "Here are the relevant news articles:\n\n"
338 |
339 | if hot_articles:
340 | hot_articles = [article.__dict__ for article in hot_articles]
341 | hot_articles = sorted(hot_articles, key=lambda x: x["pub_date"], reverse=True)
342 |
343 | for article in hot_articles:
344 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p")
345 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n"
346 |
347 | if historical_articles:
348 | historical_articles = [article.__dict__ for article in historical_articles]
349 | historical_articles = sorted(
350 | historical_articles, key=lambda x: x["pub_date"], reverse=True
351 | )
352 |
353 | for article in historical_articles:
354 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p")
355 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n"
356 |
357 | if not hot_articles and not historical_articles:
358 | formatted_articles += "No articles were found.\n\n"
359 | return formatted_articles
360 |
361 | return formatted_articles
362 |
363 | ############### BINARY ###############
364 | # @title Binary prompt & functions
365 |
366 | # This section includes functionality for binary questions.
367 |
368 | BINARY_PROMPT_TEMPLATE = """
369 | You are a professional forecaster interviewing for a job.
370 |
371 | Your interview question is:
372 | {title}
373 |
374 | Question background:
375 | {background}
376 |
377 |
378 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied:
379 | {resolution_criteria}
380 |
381 | {fine_print}
382 |
383 |
384 | Your research assistant says:
385 | {summary_report}
386 |
387 | Today is {today}.
388 |
389 | Before answering you write:
390 | (a) The time left until the outcome to the question is known.
391 | (b) The status quo outcome if nothing changed.
392 | (c) A brief description of a scenario that results in a No outcome.
393 | (d) A brief description of a scenario that results in a Yes outcome.
394 |
395 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time.
396 |
397 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100
398 | """
399 |
400 |
401 | def extract_probability_from_response_as_percentage_not_decimal(
402 | forecast_text: str,
403 | ) -> float:
404 | matches = re.findall(r"(\d+)%", forecast_text)
405 | if matches:
406 | # Return the last number found before a '%'
407 | number = int(matches[-1])
408 | number = min(99, max(1, number)) # clamp the number between 1 and 99
409 | return number
410 | else:
411 | raise ValueError(f"Could not extract prediction from response: {forecast_text}")
412 |
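# Example (illustrative): for a rationale ending in "... Probability: 65%", the function above
# returns 65 - a percentage, not a decimal. Extracted values are clamped to [1, 99], so "0%"
# becomes 1 and "100%" becomes 99.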
413 |
414 | async def get_binary_gpt_prediction(
415 | question_details: dict, num_runs: int
416 | ) -> tuple[float, str]:
417 |
418 | today = datetime.datetime.now().strftime("%Y-%m-%d")
419 | title = question_details["title"]
420 | resolution_criteria = question_details["resolution_criteria"]
421 | background = question_details["description"]
422 | fine_print = question_details["fine_print"]
423 | question_type = question_details["type"]
424 |
425 | summary_report = run_research(title)
426 |
427 | content = BINARY_PROMPT_TEMPLATE.format(
428 | title=title,
429 | today=today,
430 | background=background,
431 | resolution_criteria=resolution_criteria,
432 | fine_print=fine_print,
433 | summary_report=summary_report,
434 | )
435 |
436 | async def get_rationale_and_probability(content: str) -> tuple[float, str]:
437 | rationale = await call_llm(content)
438 |
439 | probability = extract_probability_from_response_as_percentage_not_decimal(
440 | rationale
441 | )
442 | comment = (
443 | f"Extracted Probability: {probability}%\n\nGPT's Answer: "
444 | f"{rationale}\n\n\n"
445 | )
446 | return probability, comment
447 |
448 | probability_and_comment_pairs = await asyncio.gather(
449 | *[get_rationale_and_probability(content) for _ in range(num_runs)]
450 | )
451 | comments = [pair[1] for pair in probability_and_comment_pairs]
452 | final_comment_sections = [
453 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments)
454 | ]
455 | probabilities = [pair[0] for pair in probability_and_comment_pairs]
456 | median_probability = float(np.median(probabilities)) / 100
457 |
458 | final_comment = f"Median Probability: {median_probability}\n\n" + "\n\n".join(
459 | final_comment_sections
460 | )
461 | return median_probability, final_comment
462 |
463 |
464 | ####################### NUMERIC ###############
465 | # @title Numeric prompt & functions
466 |
467 | NUMERIC_PROMPT_TEMPLATE = """
468 | You are a professional forecaster interviewing for a job.
469 |
470 | Your interview question is:
471 | {title}
472 |
473 | Background:
474 | {background}
475 |
476 | {resolution_criteria}
477 |
478 | {fine_print}
479 |
480 | Units for answer: {units}
481 |
482 | Your research assistant says:
483 | {summary_report}
484 |
485 | Today is {today}.
486 |
487 | {lower_bound_message}
488 | {upper_bound_message}
489 |
490 |
491 | Formatting Instructions:
492 | - Please notice the units requested (e.g. whether you represent a number as 1,000,000 or 1m).
493 | - Never use scientific notation.
494 | - Always start with a smaller number (more negative if negative) and then increase from there
495 |
496 | Before answering you write:
497 | (a) The time left until the outcome to the question is known.
498 | (b) The outcome if nothing changed.
499 | (c) The outcome if the current trend continued.
500 | (d) The expectations of experts and markets.
501 | (e) A brief description of an unexpected scenario that results in a low outcome.
502 | (f) A brief description of an unexpected scenario that results in a high outcome.
503 |
504 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.
505 |
506 | The last thing you write is your final answer as:
507 | "
508 | Percentile 10: XX
509 | Percentile 20: XX
510 | Percentile 40: XX
511 | Percentile 60: XX
512 | Percentile 80: XX
513 | Percentile 90: XX
514 | "
515 | """
516 |
517 |
518 | def extract_percentiles_from_response(forecast_text: str) -> dict:
519 |
520 |     # Helper function that returns a {percentile: value} dict built from every line mentioning "Percentile"
521 | def extract_percentile_numbers(text) -> dict:
522 | pattern = r"^.*(?:P|p)ercentile.*$"
523 | number_pattern = r"-\s*(?:[^\d\-]*\s*)?(\d+(?:,\d{3})*(?:\.\d+)?)|(\d+(?:,\d{3})*(?:\.\d+)?)"
524 | results = []
525 |
526 | for line in text.split("\n"):
527 | if re.match(pattern, line):
528 | numbers = re.findall(number_pattern, line)
529 | numbers_no_commas = [
530 | next(num for num in match if num).replace(",", "")
531 | for match in numbers
532 | ]
533 | numbers = [
534 | float(num) if "." in num else int(num)
535 | for num in numbers_no_commas
536 | ]
537 | if len(numbers) > 1:
538 | first_number = numbers[0]
539 | last_number = numbers[-1]
540 | # Check if the original line had a negative sign before the last number
541 | if "-" in line.split(":")[-1]:
542 | last_number = -abs(last_number)
543 | results.append((first_number, last_number))
544 |
545 | # Convert results to dictionary
546 | percentile_values = {}
547 | for first_num, second_num in results:
548 | key = first_num
549 | percentile_values[key] = second_num
550 |
551 | return percentile_values
552 |
553 | percentile_values = extract_percentile_numbers(forecast_text)
554 |
555 | if len(percentile_values) > 0:
556 | return percentile_values
557 | else:
558 | raise ValueError(f"Could not extract prediction from response: {forecast_text}")
559 |
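# Example (illustrative): given a rationale containing the lines
#   Percentile 10: 1,200
#   Percentile 90: 4,500
# extract_percentiles_from_response returns {10: 1200, 90: 4500}; commas are stripped and a
# value is made negative when a '-' appears after the final colon on its line.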
560 |
561 | def generate_continuous_cdf(
562 | percentile_values: dict,
563 | question_type: str,
564 | open_upper_bound: bool,
565 | open_lower_bound: bool,
566 | upper_bound: float,
567 | lower_bound: float,
568 | zero_point: float | None,
569 | ) -> list[float]:
570 | """
571 | Returns: list[float]: A list of 201 float values representing the CDF.
572 | """
573 |
574 | percentile_max = max(float(key) for key in percentile_values.keys())
575 | percentile_min = min(float(key) for key in percentile_values.keys())
576 | range_min = lower_bound
577 | range_max = upper_bound
578 | range_size = range_max - range_min
579 | buffer = 1 if range_size > 100 else 0.01 * range_size
580 |
581 | # Adjust any values that are exactly at the bounds
582 | for percentile, value in list(percentile_values.items()):
583 | if not open_lower_bound and value <= range_min + buffer:
584 | percentile_values[percentile] = range_min + buffer
585 | if not open_upper_bound and value >= range_max - buffer:
586 | percentile_values[percentile] = range_max - buffer
587 |
588 |     # Anchor the upper tail of the cdf (above the highest stated percentile)
589 | if open_upper_bound:
590 | if range_max > percentile_values[percentile_max]:
591 | percentile_values[int(100 - (0.5 * (100 - percentile_max)))] = range_max
592 | else:
593 | percentile_values[100] = range_max
594 |
595 |     # Anchor the lower tail of the cdf (below the lowest stated percentile)
596 | if open_lower_bound:
597 | if range_min < percentile_values[percentile_min]:
598 | percentile_values[int(0.5 * percentile_min)] = range_min
599 | else:
600 | percentile_values[0] = range_min
601 |
602 | sorted_percentile_values = dict(sorted(percentile_values.items()))
603 |
604 | # Normalize percentile keys
605 | normalized_percentile_values = {}
606 | for key, value in sorted_percentile_values.items():
607 | percentile = float(key) / 100
608 | normalized_percentile_values[percentile] = value
609 |
610 |
611 | value_percentiles = {
612 | value: key for key, value in normalized_percentile_values.items()
613 | }
614 |
615 |     # Generate the 201 x-axis locations (log-scaled when zero_point is set, linear otherwise)
616 | def generate_cdf_locations(range_min, range_max, zero_point):
617 | if zero_point is None:
618 | scale = lambda x: range_min + (range_max - range_min) * x
619 | else:
620 | deriv_ratio = (range_max - zero_point) / (range_min - zero_point)
621 | scale = lambda x: range_min + (range_max - range_min) * (
622 | deriv_ratio**x - 1
623 | ) / (deriv_ratio - 1)
624 | return [scale(x) for x in np.linspace(0, 1, 201)]
625 |
626 | cdf_xaxis = generate_cdf_locations(range_min, range_max, zero_point)
627 |
628 | def linear_interpolation(x_values, xy_pairs):
629 | # Sort the xy_pairs by x-values
630 | sorted_pairs = sorted(xy_pairs.items())
631 |
632 | # Extract sorted x and y values
633 | known_x = [pair[0] for pair in sorted_pairs]
634 | known_y = [pair[1] for pair in sorted_pairs]
635 |
636 | # Initialize the result list
637 | y_values = []
638 |
639 | for x in x_values:
640 | # Check if x is exactly in the known x values
641 | if x in known_x:
642 | y_values.append(known_y[known_x.index(x)])
643 | else:
644 | # Find the indices of the two nearest known x-values
645 | i = 0
646 | while i < len(known_x) and known_x[i] < x:
647 | i += 1
648 |
649 | list_index_2 = i
650 |
651 | # If x is outside the range of known x-values, use the nearest endpoint
652 | if i == 0:
653 | y_values.append(known_y[0])
654 | elif i == len(known_x):
655 | y_values.append(known_y[-1])
656 | else:
657 | # Perform linear interpolation
658 | x0, x1 = known_x[i - 1], known_x[i]
659 | y0, y1 = known_y[i - 1], known_y[i]
660 |
661 | # Linear interpolation formula
662 | y = y0 + (x - x0) * (y1 - y0) / (x1 - x0)
663 | y_values.append(y)
664 |
665 | return y_values
666 |
667 | continuous_cdf = linear_interpolation(cdf_xaxis, value_percentiles)
668 | return continuous_cdf
669 |
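# Summary of the construction above (paraphrasing the code, not an official spec): the few
# percentile anchors from the LLM (e.g. {10: 1200, ..., 90: 4500}) are nudged inside closed
# bounds, anchored to the range endpoints (or to extrapolated percentiles when a bound is
# open), normalized to the 0-1 scale, and linearly interpolated onto 201 x-axis locations
# (log-spaced when zero_point is set) to produce the 201 cdf values the Metaculus API expects.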
670 |
671 | async def get_numeric_gpt_prediction(
672 | question_details: dict, num_runs: int
673 | ) -> tuple[list[float], str]:
674 |
675 | today = datetime.datetime.now().strftime("%Y-%m-%d")
676 | title = question_details["title"]
677 | resolution_criteria = question_details["resolution_criteria"]
678 | background = question_details["description"]
679 | fine_print = question_details["fine_print"]
680 | question_type = question_details["type"]
681 | scaling = question_details["scaling"]
682 | open_upper_bound = question_details["open_upper_bound"]
683 | open_lower_bound = question_details["open_lower_bound"]
684 | unit_of_measure = question_details["unit"] if question_details["unit"] else "Not stated (please infer this)"
685 | upper_bound = scaling["range_max"]
686 | lower_bound = scaling["range_min"]
687 | zero_point = scaling["zero_point"]
688 |
689 | # Create messages about the bounds that are passed in the LLM prompt
690 | if open_upper_bound:
691 | upper_bound_message = ""
692 | else:
693 | upper_bound_message = f"The outcome can not be higher than {upper_bound}."
694 | if open_lower_bound:
695 | lower_bound_message = ""
696 | else:
697 | lower_bound_message = f"The outcome can not be lower than {lower_bound}."
698 |
699 | summary_report = run_research(title)
700 |
701 | content = NUMERIC_PROMPT_TEMPLATE.format(
702 | title=title,
703 | today=today,
704 | background=background,
705 | resolution_criteria=resolution_criteria,
706 | fine_print=fine_print,
707 | summary_report=summary_report,
708 | lower_bound_message=lower_bound_message,
709 | upper_bound_message=upper_bound_message,
710 | units=unit_of_measure,
711 | )
712 |
713 | async def ask_llm_to_get_cdf(content: str) -> tuple[list[float], str]:
714 | rationale = await call_llm(content)
715 | percentile_values = extract_percentiles_from_response(rationale)
716 |
717 | comment = (
718 | f"Extracted Percentile_values: {percentile_values}\n\nGPT's Answer: "
719 | f"{rationale}\n\n\n"
720 | )
721 |
722 | cdf = generate_continuous_cdf(
723 | percentile_values,
724 | question_type,
725 | open_upper_bound,
726 | open_lower_bound,
727 | upper_bound,
728 | lower_bound,
729 | zero_point,
730 | )
731 |
732 | return cdf, comment
733 |
734 | cdf_and_comment_pairs = await asyncio.gather(
735 | *[ask_llm_to_get_cdf(content) for _ in range(num_runs)]
736 | )
737 | comments = [pair[1] for pair in cdf_and_comment_pairs]
738 | final_comment_sections = [
739 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments)
740 | ]
741 | cdfs: list[list[float]] = [pair[0] for pair in cdf_and_comment_pairs]
742 | all_cdfs = np.array(cdfs)
743 | median_cdf: list[float] = np.median(all_cdfs, axis=0).tolist()
744 |
745 | final_comment = f"Median CDF: `{str(median_cdf)[:100]}...`\n\n" + "\n\n".join(
746 | final_comment_sections
747 | )
748 | return median_cdf, final_comment
749 |
750 |
751 | ########################## MULTIPLE CHOICE ###############
752 | # @title Multiple Choice prompt & functions
753 |
754 | MULTIPLE_CHOICE_PROMPT_TEMPLATE = """
755 | You are a professional forecaster interviewing for a job.
756 |
757 | Your interview question is:
758 | {title}
759 |
760 | The options are: {options}
761 |
762 |
763 | Background:
764 | {background}
765 |
766 | {resolution_criteria}
767 |
768 | {fine_print}
769 |
770 |
771 | Your research assistant says:
772 | {summary_report}
773 |
774 | Today is {today}.
775 |
776 | Before answering you write:
777 | (a) The time left until the outcome to the question is known.
778 | (b) The status quo outcome if nothing changed.
779 | (c) A description of a scenario that results in an unexpected outcome.
780 |
781 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes.
782 |
783 | The last thing you write is your final probabilities for the N options in this order {options} as:
784 | Option_A: Probability_A
785 | Option_B: Probability_B
786 | ...
787 | Option_N: Probability_N
788 | """
789 |
790 |
791 | def extract_option_probabilities_from_response(forecast_text: str, options) -> list[float]:
792 | 
793 |     # Helper function that returns the last number found on each line of the response
794 | def extract_option_probabilities(text):
795 |
796 | # Number extraction pattern
797 | number_pattern = r"-?\d+(?:,\d{3})*(?:\.\d+)?"
798 |
799 | results = []
800 |
801 | # Iterate through each line in the text
802 | for line in text.split("\n"):
803 | # Extract all numbers from the line
804 | numbers = re.findall(number_pattern, line)
805 | numbers_no_commas = [num.replace(",", "") for num in numbers]
806 | # Convert strings to float or int
807 | numbers = [
808 | float(num) if "." in num else int(num) for num in numbers_no_commas
809 | ]
810 | # Add the tuple of numbers to results
811 | if len(numbers) >= 1:
812 | last_number = numbers[-1]
813 | results.append(last_number)
814 |
815 | return results
816 |
817 | option_probabilities = extract_option_probabilities(forecast_text)
818 |
819 | NUM_OPTIONS = len(options)
820 |
821 | if len(option_probabilities) > 0:
822 | # return the last NUM_OPTIONS items
823 | return option_probabilities[-NUM_OPTIONS:]
824 | else:
825 | raise ValueError(f"Could not extract prediction from response: {forecast_text}")
826 |
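# Example (illustrative): if the rationale ends with the lines
#   Red: 70
#   Blue: 20
#   Green: 10
# and options is ["Red", "Blue", "Green"], the function above returns [70, 20, 10] - the last
# number on each of the final len(options) number-bearing lines.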
827 |
828 | def generate_multiple_choice_forecast(options, option_probabilities) -> dict:
829 | """
830 | Returns: dict corresponding to the probabilities of each option.
831 | """
832 |
833 | # confirm that there is a probability for each option
834 | if len(options) != len(option_probabilities):
835 | raise ValueError(
836 | f"Number of options ({len(options)}) does not match number of probabilities ({len(option_probabilities)})"
837 | )
838 |
839 | # Ensure we are using decimals
840 | total_sum = sum(option_probabilities)
841 | decimal_list = [x / total_sum for x in option_probabilities]
842 |
843 | def normalize_list(float_list):
844 | # Step 1: Clamp values
845 | clamped_list = [max(min(x, 0.99), 0.01) for x in float_list]
846 |
847 | # Step 2: Calculate the sum of all elements
848 | total_sum = sum(clamped_list)
849 |
850 | # Step 3: Normalize the list so that all elements add up to 1
851 | normalized_list = [x / total_sum for x in clamped_list]
852 |
853 | # Step 4: Adjust for any small floating-point errors
854 | adjustment = 1.0 - sum(normalized_list)
855 | normalized_list[-1] += adjustment
856 |
857 | return normalized_list
858 |
859 | normalized_option_probabilities = normalize_list(decimal_list)
860 |
861 | probability_yes_per_category = {}
862 | for i in range(len(options)):
863 | probability_yes_per_category[options[i]] = normalized_option_probabilities[i]
864 |
865 | return probability_yes_per_category
866 |
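# Example (illustrative): generate_multiple_choice_forecast(["Red", "Blue", "Green"], [70, 20, 10])
# rescales to decimals ([0.7, 0.2, 0.1]), clamps each value into [0.01, 0.99], renormalizes to
# sum to 1, and returns approximately {"Red": 0.7, "Blue": 0.2, "Green": 0.1}.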
867 |
868 | async def get_multiple_choice_gpt_prediction(
869 | question_details: dict,
870 | num_runs: int,
871 | ) -> tuple[dict[str, float], str]:
872 |
873 | today = datetime.datetime.now().strftime("%Y-%m-%d")
874 | title = question_details["title"]
875 | resolution_criteria = question_details["resolution_criteria"]
876 | background = question_details["description"]
877 | fine_print = question_details["fine_print"]
878 | question_type = question_details["type"]
879 | options = question_details["options"]
880 |
881 | summary_report = run_research(title)
882 |
883 | content = MULTIPLE_CHOICE_PROMPT_TEMPLATE.format(
884 | title=title,
885 | today=today,
886 | background=background,
887 | resolution_criteria=resolution_criteria,
888 | fine_print=fine_print,
889 | summary_report=summary_report,
890 | options=options,
891 | )
892 |
893 | async def ask_llm_for_multiple_choice_probabilities(
894 | content: str,
895 | ) -> tuple[dict[str, float], str]:
896 | rationale = await call_llm(content)
897 |
898 |
899 | option_probabilities = extract_option_probabilities_from_response(
900 | rationale, options
901 | )
902 |
903 | comment = (
904 | f"EXTRACTED_PROBABILITIES: {option_probabilities}\n\nGPT's Answer: "
905 | f"{rationale}\n\n\n"
906 | )
907 |
908 | probability_yes_per_category = generate_multiple_choice_forecast(
909 | options, option_probabilities
910 | )
911 | return probability_yes_per_category, comment
912 |
913 | probability_yes_per_category_and_comment_pairs = await asyncio.gather(
914 | *[ask_llm_for_multiple_choice_probabilities(content) for _ in range(num_runs)]
915 | )
916 | comments = [pair[1] for pair in probability_yes_per_category_and_comment_pairs]
917 | final_comment_sections = [
918 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments)
919 | ]
920 | probability_yes_per_category_dicts: list[dict[str, float]] = [
921 | pair[0] for pair in probability_yes_per_category_and_comment_pairs
922 | ]
923 | average_probability_yes_per_category: dict[str, float] = {}
924 | for option in options:
925 | probabilities_for_current_option: list[float] = [
926 |             forecast_dict[option] for forecast_dict in probability_yes_per_category_dicts
927 | ]
928 | average_probability_yes_per_category[option] = sum(
929 | probabilities_for_current_option
930 | ) / len(probabilities_for_current_option)
931 |
932 | final_comment = (
933 | f"Average Probability Yes Per Category: `{average_probability_yes_per_category}`\n\n"
934 | + "\n\n".join(final_comment_sections)
935 | )
936 | return average_probability_yes_per_category, final_comment
937 |
938 |
939 | ################### FORECASTING ###################
940 | def forecast_is_already_made(post_details: dict) -> bool:
941 | """
942 | Check if a forecast has already been made by looking at my_forecasts in the question data.
943 |
944 | question.my_forecasts.latest.forecast_values has the following values for each question type:
945 | Binary: [probability for no, probability for yes]
946 | Numeric: [cdf value 1, cdf value 2, ..., cdf value 201]
947 | Multiple Choice: [probability for option 1, probability for option 2, ...]
948 | """
949 | try:
950 | forecast_values = post_details["question"]["my_forecasts"]["latest"][
951 | "forecast_values"
952 | ]
953 | return forecast_values is not None
954 | except Exception:
955 | return False
956 |
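# Illustrative shape of the field inspected above (a sketch, not the full API schema): for a
# binary question the bot has already forecast,
#   post_details["question"]["my_forecasts"]["latest"]["forecast_values"] == [0.35, 0.65]
# while a missing key anywhere along that path raises and is caught, returning False.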
957 |
958 | async def forecast_individual_question(
959 | question_id: int,
960 | post_id: int,
961 | submit_prediction: bool,
962 | num_runs_per_question: int,
963 | skip_previously_forecasted_questions: bool,
964 | ) -> str:
965 | post_details = get_post_details(post_id)
966 | question_details = post_details["question"]
967 | title = question_details["title"]
968 | question_type = question_details["type"]
969 |
970 | summary_of_forecast = ""
971 | summary_of_forecast += f"-----------------------------------------------\nQuestion: {title}\n"
972 | summary_of_forecast += f"URL: https://www.metaculus.com/questions/{post_id}/\n"
973 |
974 | if question_type == "multiple_choice":
975 | options = question_details["options"]
976 | summary_of_forecast += f"options: {options}\n"
977 |
978 | if (
979 | forecast_is_already_made(post_details)
980 |         and skip_previously_forecasted_questions
981 |     ):
982 |         summary_of_forecast += "Skipped: Forecast already made\n"
983 | return summary_of_forecast
984 |
985 | if question_type == "binary":
986 | forecast, comment = await get_binary_gpt_prediction(
987 | question_details, num_runs_per_question
988 | )
989 | elif question_type == "numeric":
990 | forecast, comment = await get_numeric_gpt_prediction(
991 | question_details, num_runs_per_question
992 | )
993 | elif question_type == "multiple_choice":
994 | forecast, comment = await get_multiple_choice_gpt_prediction(
995 | question_details, num_runs_per_question
996 | )
997 | else:
998 | raise ValueError(f"Unknown question type: {question_type}")
999 |
1000 | print(f"-----------------------------------------------\nPost {post_id} Question {question_id}:\n")
1001 | print(f"Forecast for post {post_id} (question {question_id}):\n{forecast}")
1002 | print(f"Comment for post {post_id} (question {question_id}):\n{comment}")
1003 |
1004 | if question_type == "numeric":
1005 | summary_of_forecast += f"Forecast: {str(forecast)[:200]}...\n"
1006 | else:
1007 | summary_of_forecast += f"Forecast: {forecast}\n"
1008 |
1009 | summary_of_forecast += f"Comment:\n```\n{comment[:200]}...\n```\n\n"
1010 |
1011 |     if submit_prediction:
1012 | forecast_payload = create_forecast_payload(forecast, question_type)
1013 | post_question_prediction(question_id, forecast_payload)
1014 | post_question_comment(post_id, comment)
1015 | summary_of_forecast += "Posted: Forecast was posted to Metaculus.\n"
1016 |
1017 | return summary_of_forecast
1018 |
1019 |
1020 | async def forecast_questions(
1021 | open_question_id_post_id: list[tuple[int, int]],
1022 | submit_prediction: bool,
1023 | num_runs_per_question: int,
1024 | skip_previously_forecasted_questions: bool,
1025 | ) -> None:
1026 | forecast_tasks = [
1027 | forecast_individual_question(
1028 | question_id,
1029 | post_id,
1030 | submit_prediction,
1031 | num_runs_per_question,
1032 | skip_previously_forecasted_questions,
1033 | )
1034 | for question_id, post_id in open_question_id_post_id
1035 | ]
1036 | forecast_summaries = await asyncio.gather(*forecast_tasks, return_exceptions=True)
1037 | print("\n", "#" * 100, "\nForecast Summaries\n", "#" * 100)
1038 |
1039 | errors = []
1040 | for question_id_post_id, forecast_summary in zip(
1041 | open_question_id_post_id, forecast_summaries
1042 | ):
1043 | question_id, post_id = question_id_post_id
1044 | if isinstance(forecast_summary, Exception):
1045 | print(
1046 | f"-----------------------------------------------\nPost {post_id} Question {question_id}:\nError: {forecast_summary.__class__.__name__} {forecast_summary}\nURL: https://www.metaculus.com/questions/{post_id}/\n"
1047 | )
1048 | errors.append(forecast_summary)
1049 | else:
1050 | print(forecast_summary)
1051 |
1052 | if errors:
1053 | print("-----------------------------------------------\nErrors:\n")
1054 | error_message = f"Errors were encountered: {errors}"
1055 | print(error_message)
1056 | raise RuntimeError(error_message)
1057 |
1058 |
1059 |
1060 |
1061 | ######################## FINAL RUN #########################
1062 | if __name__ == "__main__":
1063 | if USE_EXAMPLE_QUESTIONS:
1064 | open_question_id_post_id = EXAMPLE_QUESTIONS
1065 | else:
1066 | open_question_id_post_id = get_open_question_ids_from_tournament()
1067 |
1068 | asyncio.run(
1069 | forecast_questions(
1070 | open_question_id_post_id,
1071 | SUBMIT_PREDICTION,
1072 | NUM_RUNS_PER_QUESTION,
1073 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS,
1074 | )
1075 | )
1076 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "metac-bot-template"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Vasile Popescu "]
6 | readme = "README.md"
7 | package-mode = false
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | python-decouple = "^3.8"
12 | requests = "^2.32.3"
13 | asknews = "^0.9.1"
14 | numpy = "^1.26.0"
15 | openai = "^1.57.4"
16 | python-dotenv = "^1.0.1"
17 | forecasting-tools = "^0.2.23"
18 |
19 |
20 | [tool.poetry.group.dev.dependencies]
21 | ipykernel = "^6.29.5"
22 |
23 | [build-system]
24 | requires = ["poetry-core"]
25 | build-backend = "poetry.core.masonry.api"
26 |
27 |
--------------------------------------------------------------------------------