├── .env.template ├── pyproject.toml ├── .github └── workflows │ ├── run_bot_on_metaculus_cup.yaml │ ├── test_bot.yaml │ └── run_bot_on_tournament.yaml ├── .gitignore ├── community_benchmark.py ├── README.md ├── main.py └── main_with_no_framework.py /.env.template: -------------------------------------------------------------------------------- 1 | # Required 2 | METACULUS_TOKEN=1234567890 3 | 4 | # Optional 5 | OPENROUTER_API_KEY=1234567890 6 | PERPLEXITY_API_KEY=1234567890 7 | OPENAI_API_KEY=1234567890 8 | EXA_API_KEY=1234567890 9 | ASKNEWS_CLIENT_ID=1234567890 10 | ASKNEWS_SECRET=1234567890 11 | ANTHROPIC_API_KEY=1234567890 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "metac-bot-template" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Vasile Popescu "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | python-decouple = "^3.8" 12 | requests = "^2.32.3" 13 | asknews = "^0.13.0" 14 | numpy = "^2.3.0" 15 | openai = "^2.0.0" 16 | python-dotenv = "^1.0.1" 17 | forecasting-tools = "^0.2.80" 18 | 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | ipykernel = "^6.29.5" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/run_bot_on_metaculus_cup.yaml: -------------------------------------------------------------------------------- 1 | name: Forecast on Metaculus Cup 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 */2 * *" # runs at midnight every 2 days 7 | 8 | # Add concurrency group to prevent parallel runs 9 | concurrency: 10 | group: ${{ github.workflow }} 11 | cancel-in-progress: false 12 | 13 | 14 | # Daily job to run the simple forecast bot 15 | jobs: 16 | forecast_job: 17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 18 | steps: # sets up the steps that will be run in order 19 | # setup repository with all necessary dependencies - keep as is 20 | - name: Check out repository 21 | uses: actions/checkout@v3 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: "3.11" 25 | - name: Install poetry 26 | uses: snok/install-poetry@v1 27 | with: 28 | virtualenvs-create: true 29 | virtualenvs-in-project: true 30 | installer-parallel: true 31 | - name: Install dependencies 32 | run: poetry install --no-interaction --no-root 33 | - name: Run bot 34 | run: | 35 | poetry run python main.py --mode metaculus_cup 36 | # this reads the environment variables from the github repository. 
37 | # Store under Settings --> Secrets and variables --> Actions 38 | env: 39 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 40 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 41 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 42 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 43 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 44 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 45 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 46 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.github/workflows/test_bot.yaml: -------------------------------------------------------------------------------- 1 | name: Test Bot 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | # Add concurrency group to prevent parallel runs 7 | concurrency: 8 | group: ${{ github.workflow }} 9 | cancel-in-progress: false 10 | 11 | 12 | # Daily job to run the simple forecast bot 13 | jobs: 14 | forecast_job: 15 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 16 | steps: # sets up the steps that will be run in order 17 | # setup repository with all necessary dependencies - keep as is 18 | - name: Check out repository 19 | uses: actions/checkout@v3 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.11" 23 | - name: Install poetry 24 | uses: snok/install-poetry@v1 25 | with: 26 | virtualenvs-create: true 27 | virtualenvs-in-project: true 28 | installer-parallel: true 29 | # Adding the below will make the workflow faster, but will mean you don't automatically get updates from forecasting-tools and other packages 30 | # - name: Load cached venv 31 | # id: cached-poetry-dependencies 32 | # uses: actions/cache@v4 33 | # with: 34 | # path: .venv 35 | # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 36 | - name: Install dependencies 37 | run: poetry install --no-interaction --no-root 38 | - name: Run bot 39 | run: | 40 | poetry run python main.py --mode test_questions 41 | # this reads the environment variables from the github repository. 42 | # Store under Settings --> Secrets and variables --> Actions 43 | env: 44 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 45 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 46 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 49 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 50 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 51 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.github/workflows/run_bot_on_tournament.yaml: -------------------------------------------------------------------------------- 1 | name: Forecast on new AI tournament questions 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "*/20 * * * *" # runs every 20 minutes. Make sure to skip already forecasted questions! 
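# Note: main.py sets skip_previously_forecasted_questions=True, so these frequent runs only forecast questions that do not yet have a forecast.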
7 | 8 | # Add concurrency group to prevent parallel runs 9 | concurrency: 10 | group: ${{ github.workflow }} 11 | cancel-in-progress: false 12 | 13 | 14 | # Daily job to run the simple forecast bot 15 | jobs: 16 | forecast_job: 17 | runs-on: ubuntu-latest # determines the machine that will run the job - keep as is 18 | steps: # sets up the steps that will be run in order 19 | # setup repository with all necessary dependencies - keep as is 20 | - name: Check out repository 21 | uses: actions/checkout@v3 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: "3.11" 25 | - name: Install poetry 26 | uses: snok/install-poetry@v1 27 | with: 28 | virtualenvs-create: true 29 | virtualenvs-in-project: true 30 | installer-parallel: true 31 | # Adding the below will make the workflow faster, but will mean you don't automatically get updates from forecasting-tools and other packages 32 | # - name: Load cached venv 33 | # id: cached-poetry-dependencies 34 | # uses: actions/cache@v4 35 | # with: 36 | # path: .venv 37 | # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 38 | - name: Install dependencies 39 | run: poetry install --no-interaction --no-root 40 | - name: Run bot 41 | run: | 42 | poetry run python main.py 43 | # this reads the environment variables from the github repository. 44 | # Store under Settings --> Secrets and variables --> Actions 45 | env: 46 | METACULUS_TOKEN: ${{ secrets.METACULUS_TOKEN }} # replace this with the name of the variable under which you stored your own Metaculus token 47 | PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} 48 | EXA_API_KEY: ${{ secrets.EXA_API_KEY }} 49 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 50 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 51 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 52 | ASKNEWS_CLIENT_ID: ${{ secrets.ASKNEWS_CLIENT_ID }} 53 | ASKNEWS_SECRET: ${{ secrets.ASKNEWS_SECRET }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Specific to Project ### 2 | benchmarks/ 3 | sandbox.py 4 | 5 | 6 | ### General gitignore ### 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | # *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # Ruff 163 | .ruff_cache/ 164 | 165 | # PyCharm 166 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 167 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 168 | # and can be added to the global gitignore or merged into this file. For a more nuclear 169 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
170 | # .idea/ 171 | 172 | # MacOS 173 | .DS_Store 174 | -------------------------------------------------------------------------------- /community_benchmark.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import asyncio 5 | import logging 6 | import sys 7 | from datetime import datetime, timedelta 8 | from typing import Literal 9 | 10 | import typeguard 11 | from forecasting_tools import ( 12 | Benchmarker, 13 | ForecastBot, 14 | GeneralLlm, 15 | MonetaryCostManager, 16 | MetaculusApi, 17 | ApiFilter, 18 | run_benchmark_streamlit_page, 19 | ) 20 | 21 | from main import TemplateForecaster 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | 27 | async def benchmark_forecast_bot(mode: str) -> None: 28 | """ 29 | Run a benchmark that compares your forecasts against the community prediction 30 | """ 31 | 32 | number_of_questions = 30 # Recommend 100+ for meaningful error bars, but 30 is faster/cheaper 33 | if mode == "display": 34 | run_benchmark_streamlit_page() 35 | return 36 | elif mode == "run": 37 | questions = MetaculusApi.get_benchmark_questions(number_of_questions) 38 | elif mode == "custom": 39 | # Below is an example of getting custom questions 40 | one_year_from_now = datetime.now() + timedelta(days=365) 41 | api_filter = ApiFilter( 42 | allowed_statuses=["open"], 43 | allowed_types=["binary"], 44 | num_forecasters_gte=40, 45 | scheduled_resolve_time_lt=one_year_from_now, 46 | includes_bots_in_aggregates=False, 47 | community_prediction_exists=True, 48 | ) 49 | questions = await MetaculusApi.get_questions_matching_filter( 50 | api_filter, 51 | num_questions=number_of_questions, 52 | randomly_sample=True, 53 | ) 54 | for question in questions: 55 | question.background_info = None # Test ability to find new information 56 | else: 57 | raise ValueError(f"Invalid mode: {mode}") 58 | 59 | with MonetaryCostManager() as cost_manager: 60 | bots = [ 61 | TemplateForecaster( 62 | predictions_per_research_report=5, 63 | llms={ 64 | "default": GeneralLlm( 65 | model="gpt-4o-mini", 66 | temperature=0.3, 67 | ), 68 | }, 69 | ), 70 | TemplateForecaster( 71 | predictions_per_research_report=1, 72 | llms={ 73 | "default": GeneralLlm( 74 | model="gpt-4o-mini", 75 | temperature=0.3, 76 | ), 77 | }, 78 | ), 79 | # Add other ForecastBots here (or same bot with different parameters) 80 | ] 81 | bots = typeguard.check_type(bots, list[ForecastBot]) 82 | benchmarks = await Benchmarker( 83 | questions_to_use=questions, 84 | forecast_bots=bots, 85 | file_path_to_save_reports="benchmarks/", 86 | concurrent_question_batch_size=10, 87 | ).run_benchmark() 88 | for i, benchmark in enumerate(benchmarks): 89 | logger.info( 90 | f"Benchmark {i+1} of {len(benchmarks)}: {benchmark.name}" 91 | ) 92 | logger.info( 93 | f"- Final Score: {benchmark.average_expected_baseline_score}" 94 | ) 95 | logger.info(f"- Total Cost: {benchmark.total_cost}") 96 | logger.info(f"- Time taken: {benchmark.time_taken_in_minutes}") 97 | logger.info(f"Total Cost: {cost_manager.current_usage}") 98 | 99 | 100 | if __name__ == "__main__": 101 | logging.basicConfig( 102 | level=logging.INFO, 103 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 104 | handlers=[ 105 | logging.StreamHandler(sys.stdout), 106 | logging.FileHandler(f"benchmarks/log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log") 107 | ] 108 | ) 109 | 110 | # Suppress LiteLLM logging 111 | litellm_logger = logging.getLogger("LiteLLM") 112 | 
litellm_logger.setLevel(logging.WARNING) 113 | litellm_logger.propagate = False 114 | 115 | # Parse command line arguments 116 | parser = argparse.ArgumentParser( 117 | description="Benchmark a list of bots" 118 | ) 119 | parser.add_argument( 120 | "--mode", 121 | type=str, 122 | choices=["run", "custom", "display"], 123 | default="display", 124 | help="Specify the run mode (default: display)", 125 | ) 126 | args = parser.parse_args() 127 | mode: Literal["run", "custom", "display"] = ( 128 | args.mode 129 | ) 130 | asyncio.run(benchmark_forecast_bot(mode)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Simple Metaculus forecasting bot 2 | This repository contains a simple bot meant to get you started with creating your own bot for the AI Forecasting Tournament. Go to https://www.metaculus.com/aib/ for more info and tournament rules (this should link to the "Getting Started" section of our [resources](https://www.metaculus.com/notebooks/38928/ai-benchmark-resources/#GettingStarted:~:text=AI%20Forecasting%20Benchmark%3F-,Getting%20Started,-We%27ve%20published%20a) page). 3 | 4 | This project contains two files: 5 | - **main.py**: Our recommended template option that uses the [forecasting-tools](https://github.com/Metaculus/forecasting-tools) package to handle much of the work in the background for you (such as API calls). We will update the package, thus allowing you to gain new features with minimal changes to your code. 6 | - **main_with_no_framework.py**: A copy of main.py but implemented with minimal dependencies. Useful if you want a more custom approach. 7 | 8 | Join the conversation about bot creation, get support, and follow updates on the [Metaculus Discord](https://discord.com/invite/NJgCC2nDfh) 'build a forecasting bot' channel. 9 | 10 | ## 30min Video Tutorial 11 | This tutorial shows you how to set up our template bot so you can start forecasting in the tournament. 12 | 13 | [![Watch the tutorial](https://cdn.loom.com/sessions/thumbnails/fc3c1a643b984a15b510647d8f760685-42b452e1ab7d2afa-full-play.gif)](https://www.loom.com/share/fc3c1a643b984a15b510647d8f760685?sid=29b502e0-cf64-421e-82c0-3a78451159ed) 14 | 15 | If you run into trouble, reach out to `ben [at] metaculus [.com]` 16 | 17 | 18 | ## Quick start -> Fork and use GitHub Actions 19 | The easiest way to use this repo is to fork it, enable GitHub Actions, and then set repository secrets. Then your bot will run every 20 minutes, pick up new questions, and forecast on them. Automation is handled in the `.github/workflows/` folder. The `run_bot_on_tournament.yaml` workflow runs the simple bot every 20 minutes and will skip questions it has already forecasted on. 20 | 21 | 1) **Fork the repository**: Go to the [repository](https://github.com/Metaculus/metac-bot-template) and click 'fork'. 22 | 2) **Set secrets**: Go to `Settings -> Secrets and variables -> Actions -> New repository secret` and set API keys/Tokens as secrets. You will want to set your METACULUS_TOKEN and an OPENROUTER_API_KEY (or whatever LLM/search providers you plan to use). The METACULUS_TOKEN is used to post your forecasts to Metaculus. Make sure to copy the names of these variables exactly (including all caps). 23 | - You can create a METACULUS_TOKEN at https://metaculus.com/aib.
If you get confused, please see the instructions on our [resources](https://www.metaculus.com/notebooks/38928/ai-benchmark-resources/#creating-your-bot-account-and-metaculus-token) page. 24 | - You can get an OPENROUTER_API_KEY with free credits by filling out this [form](https://forms.gle/aQdYMq9Pisrf1v7d8). If you don't want to wait or want to use more models than we provide, you can also make your own API key on OpenRouter's [website](https://openrouter.ai/). First, make an account, then go to your profile, then go to "keys", and then make a key. 25 | - Other LLM and Search providers should work out of the box (such as OPENAI_API_KEY, PERPLEXITY_API_KEY, ASKNEWS_SECRET, etc), though we recommend OpenRouter to start. 26 | 3) **Enable Actions**: Go to 'Actions' then click 'Enable'. Then go to the 'Regularly forecast new questions' workflow, and click 'Enable'. To test if the workflow is working, click 'Run workflow', choose the main branch, then click the green 'Run workflow' button. This will check for new questions and forecast only on ones it has not yet successfully forecast on. 27 | 28 | The bot should just work as is at this point. You can disable the workflow by clicking `Actions > Regularly forecast new questions > Triple dots > disable workflow` 29 | 30 | ## API Keys 31 | Instructions for getting your METACULUS_TOKEN, OPENROUTER_API_KEY, or optional search provider API keys (AskNews, Exa, Perplexity, etc) are listed on the "Getting Started" section of the [resources](https://www.metaculus.com/notebooks/38928/ai-benchmark-resources/#GettingStarted:~:text=AI%20Forecasting%20Benchmark%3F-,Getting%20Started,-We%27ve%20published%20a) page. 32 | 33 | ## Changing the GitHub automation 34 | You can change which file is run in the GitHub automation by either changing the content of `main.py` to the contents of `main_with_no_framework.py` (or another script) or by changing all references to `main.py` to another script in `.github/workflows/run_bot_on_tournament.yaml` and related files. 35 | 36 | ## Editing in GitHub UI 37 | Remember that you can edit your bot without a local setup by clicking on a file in GitHub and then clicking the 'Edit this file' button. Whether you develop locally or not, when making edits, attempt to do things that you think others have not tried, as this will help further innovation in the field more than doing something that has already been done. Feel free to ask about what has or has not been tried in the Discord. 38 | 39 | ## Run/Edit the bot locally 40 | Clone the repository. Find your terminal and run the following commands: 41 | ```bash 42 | git clone https://github.com/Metaculus/metac-bot-template.git 43 | ``` 44 | 45 | If you forked the repository first, you have to replace the URL in the `git clone` command with the URL of your fork. Just go to your forked repository and copy the URL from the address bar in the browser. 46 | 47 | ### Installing dependencies 48 | Make sure you have Python and [poetry](https://python-poetry.org/docs/#installing-with-pipx) installed (poetry is a Python package manager). 49 | 50 | If you don't have poetry installed, run the below: 51 | ```bash 52 | sudo apt update -y 53 | sudo apt install -y pipx 54 | pipx install poetry 55 | 56 | # Optional 57 | poetry config virtualenvs.in-project true 58 | ``` 59 | 60 | Inside the terminal, go to the directory you cloned the repository into and run the following command: 61 | ```bash 62 | poetry install 63 | ``` 64 | to install all required dependencies.
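You can optionally verify the install with a quick import check (this simply confirms the `forecasting_tools` package resolved correctly):

```bash
poetry run python -c "import forecasting_tools; print('forecasting-tools imported OK')"
```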
65 | 66 | ### Setting environment variables 67 | 68 | Running the bot requires various environment variables. If you run the bot locally, the easiest way to set them is to create a file called `.env` in the root directory of the repository (copy the `.env.template`). 69 | 70 | ### Running the bot 71 | 72 | To test the simple bot, execute the following command in your terminal: 73 | ```bash 74 | poetry run python main.py --mode test_questions 75 | ``` 76 | Make sure to set the environment variables as described above and to set the parameters in the code to your liking. In particular, to submit predictions, make sure that `publish_reports_to_metaculus` is set to `True` (it is set to `True` by default in main.py). 77 | 78 | ## Early Benchmarking 79 | Provided in this project is an example of how to benchmark your bot's forecasts against the community prediction for questions on Metaculus. Running `community_benchmark.py` will run versions of your bot defined by you (e.g. with different LLMs or research paths) and score them on how close they are to the community prediction using expected baseline score (a proper score assuming the community prediction is the true probability). You will want to edit the file to choose which bot configurations you want to test and how many questions you want to test on. Any class inheriting from `forecasting_tools.ForecastBot` can be passed into the benchmarker. As of March 28, 2025, the benchmarker only works with binary questions. 80 | 81 | To run a benchmark: 82 | `poetry run python community_benchmark.py --mode run` 83 | 84 | To run a custom benchmark (e.g. remove background info from questions to test retrieval): 85 | `poetry run python community_benchmark.py --mode custom` 86 | 87 | To view a UI showing your scores, statistical error bars, and your bot's reasoning: 88 | `poetry run streamlit run community_benchmark.py` 89 | 90 | See more information in the benchmarking section of the [forecasting-tools repo](https://github.com/Metaculus/forecasting-tools?tab=readme-ov-file#benchmarking). 91 | 92 | 93 | ## Example usage of /news and /deepnews: 94 | If you are using AskNews, here is some useful example code.
95 | ```python 96 | from asknews_sdk import AsyncAskNewsSDK 97 | import asyncio 98 | 99 | """ 100 | More information available here: 101 | https://docs.asknews.app/en/news 102 | https://docs.asknews.app/en/deepnews 103 | 104 | Installation: 105 | pip install asknews 106 | """ 107 | 108 | client_id = "" 109 | client_secret = "" 110 | 111 | ask = AsyncAskNewsSDK( 112 | client_id=client_id, 113 | client_secret=client_secret, 114 | scopes=["chat", "news", "stories", "analytics"], 115 | ) 116 | 117 | # /news endpoint example 118 | async def search_news(query): 119 | 120 | hot_response = await ask.news.search_news( 121 | query=query, # your natural language query 122 | n_articles=5, # control the number of articles to include in the context 123 | return_type="both", 124 | strategy="latest news" # enforces looking at the latest news only 125 | ) 126 | 127 | print(hot_response.as_string) 128 | 129 | # get context from the "historical" database that contains a news archive going back to 2023 130 | historical_response = await ask.news.search_news( 131 | query=query, 132 | n_articles=10, 133 | return_type="both", 134 | strategy="news knowledge" # looks for relevant news within the past 60 days 135 | ) 136 | 137 | print(historical_response.as_string) 138 | 139 | # /deepnews endpoint example: 140 | async def deep_research( 141 | query, sources, model, search_depth=2, max_depth=2 142 | ): 143 | 144 | response = await ask.chat.get_deep_news( 145 | messages=[{"role": "user", "content": query}], 146 | search_depth=search_depth, 147 | max_depth=max_depth, 148 | sources=sources, 149 | stream=False, 150 | return_sources=False, 151 | model=model, 152 | inline_citations="numbered" 153 | ) 154 | 155 | print(response) 156 | 157 | 158 | if __name__ == "__main__": 159 | query = "What is the TAM of the global market for electric vehicles in 2025? With your final report, please report the TAM in USD using the tags ... " 160 | 161 | sources = ["asknews"] 162 | model = "deepseek-basic" 163 | search_depth = 2 164 | max_depth = 2 165 | asyncio.run( 166 | deep_research( 167 | query, sources, model, search_depth, max_depth 168 | ) 169 | ) 170 | 171 | asyncio.run(search_news(query)) 172 | ``` 173 | 174 | Some tips for DeepNews: 175 | 176 | You will get XML-style tags in your response, marking sections such as the model's thinking, its searches, and its final answer (see the DeepNews docs linked above for the exact tag names). 177 | 178 | 179 | 180 | 181 | 182 | These tags are likely useful for extracting the pieces that you need for your pipeline. For example, if you don't want to include all the thinking/searching, you could just extract the final-answer section. 183 | 184 | 185 | ## Ideas for bot improvements 186 | Below are some ideas for making a novel bot. 187 | - Finetuned LLM on Metaculus Data: Create an optimized prompt (using DSPY or a similar toolset) and/or a fine-tuned LLM using all past Metaculus data. The thought is that this will train the LLM to be well-calibrated on real-life questions. Consider knowledge cutoffs and data leakage from search providers. 188 | - Dataset explorer: Create a tool that can find if there are datasets or graphs related to a question online, download them if they exist, and then run data science on them to answer a question. 189 | - Question decomposer: A tool that takes a complex question and breaks it down into simpler questions to answer those instead. 190 | - Meta-Forecast Researcher: A tool that searches all major prediction markets, prediction aggregators, and possibly thought leaders to find relevant forecasts, and then combines them into an assessment for the current question (see [Metaforecast](https://metaforecast.org/)).
191 | - Base rate researcher: Create a tool to find accurate base rates. There is an experimental version [here](https://forecasting-tools.streamlit.app/base-rate-generator) in [forecasting-tools](https://github.com/Metaculus/forecasting-tools) that works 50% of the time. 192 | - Key factors researcher: Improve our experimental [key factors researcher](https://forecasting-tools.streamlit.app/key-factors) to find higher-significance key factors for a given question. 193 | - Monte Carlo Simulations: Experiment with combining some tools to run effective Monte Carlo simulations. This could include experimenting with combining Squiggle with the question decomposer. 194 | - Adding personality diversity, LLM diversity, and other variations: Have GPT come up with a number of different ‘expert personalities’ or 'world-models' that it runs the forecasting bot with and then aggregates the median. Additionally, run the bot on different LLMs and see if the median of different LLMs improves the forecast. Finally, try simulating up to hundreds of personalities/LLM combinations to create large, diverse crowds. Each individual could have a backstory, thinking process, biases they are resistant to, etc. This will ideally improve accuracy and give more useful bot reasoning outputs to help humans reading the output consider things from multiple angles. 195 | - Worldbuilding: Have GPT world-build different future scenarios and then forecast all the different parts of those scenarios. It would then choose the most likely future world. In addition to a forecast, descriptions of future ‘worlds’ are created. This can take inspiration from Feynman paths. 196 | - Consistency Forecasting: Forecast many tangential questions all at once (in a single prompt) and prompt for consistency rules. 197 | - Extremize & Calibrate Predictions: Using the historical performance of a bot, adjust forecasts to be better calibrated. For instance, if predictions of 30% from the bot actually happen 40% of the time, then transform predictions of 30% to 40% (a minimal sketch of the related extremizing transform appears after this list). 198 | - Assigning points to evidence: Starting with some ideas from a [blog post from Ozzie Gooen](https://forum.effectivealtruism.org/posts/mrAZFnEjsQAQPJvLh/using-points-to-rate-different-kinds-of-evidence), you could experiment with assigning ‘points’ to major types of evidence and having GPT categorize the evidence it finds related to a forecast so that the ‘total points’ can be calculated. This can then be turned into a forecast, and potentially optimized using machine learning on past Metaculus data. 199 | - Search provider benchmark: Run bots using different combinations of search providers (e.g. Google, Bing, Exa.ai, Tavily, AskNews, Perplexity, etc) and search filters (e.g. only recent data, sites with a certain search rank, etc) and see if any specific one is better than others, or if using multiple of them makes a difference. 200 | - Timeline researcher: Make a tool that can take a niche topic and make a timeline for all major and minor events relevant to that topic. 201 | - Research Tools: Utilize the ComputerUse and DataAnalyzer tools from forecasting-tools for advanced analysis and to find/analyze datasets.
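As a minimal sketch of the "Extremize & Calibrate Predictions" idea above (the helper below is illustrative, not part of forecasting-tools; the constant `k` is an assumption you would ideally fit on your bot's historical forecast/outcome pairs):

```python
import numpy as np

def extremize(p: float, k: float = 1.5) -> float:
    """Push a probability away from 0.5 by scaling its log-odds by k."""
    logit = np.log(p / (1 - p))                   # probability -> log-odds
    adjusted = 1 / (1 + np.exp(-k * logit))       # scale log-odds, map back
    return float(np.clip(adjusted, 0.01, 0.99))   # same 1%-99% clamp main.py uses

print(extremize(0.30))  # ~0.22, i.e. further from 0.5 than the raw 30%
```

A calibration-based variant would replace the fixed `k` with a mapping learned from past (forecast, outcome) pairs, e.g. via isotonic regression.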
202 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import logging 4 | from datetime import datetime, timezone 5 | from typing import Literal 6 | 7 | 8 | from forecasting_tools import ( 9 | AskNewsSearcher, 10 | BinaryQuestion, 11 | ForecastBot, 12 | GeneralLlm, 13 | MetaculusClient, 14 | MetaculusQuestion, 15 | MultipleChoiceQuestion, 16 | NumericDistribution, 17 | NumericQuestion, 18 | DateQuestion, 19 | DatePercentile, 20 | Percentile, 21 | ConditionalQuestion, 22 | ConditionalPrediction, 23 | PredictionTypes, 24 | PredictionAffirmed, 25 | BinaryPrediction, 26 | PredictedOptionList, 27 | ReasonedPrediction, 28 | SmartSearcher, 29 | clean_indents, 30 | structure_output, 31 | ) 32 | 33 | logger = logging.getLogger(__name__) 34 | 35 | 36 | class SpringTemplateBot2026(ForecastBot): 37 | """ 38 | This is the template bot for the Spring 2026 Metaculus AI Tournament. 39 | This is a copy of what is used by Metaculus to run the Metac Bots in our benchmark, provided as a template for new bot makers. 40 | This template is given as-is; use it at your own risk. 41 | We have covered most test cases in forecasting-tools, but it may be worth double-checking key components locally. 42 | So far our track record has been 1 mentionable bug per season (affecting forecasts for 1-2% of total questions). 43 | 44 | Main changes since Fall: 45 | - Additional prompting has been added to numeric questions to emphasize putting percentile values in the correct order. 46 | - Support for conditional and date questions has been added 47 | - Note: Spring AIB will not use date/conditional questions, so these are only for forecasting on the main site as you wish. 48 | 49 | The main entry point of this bot is `bot.forecast_on_tournament(tournament_id)` in the parent class. 50 | See the script at the bottom of the file for more details on how to run the bot. 51 | Ignoring the finer details, the general flow is: 52 | - Load questions from Metaculus 53 | - For each question 54 | - Execute run_research a number of times equal to research_reports_per_question 55 | - Execute respective run_forecast function `predictions_per_research_report * research_reports_per_question` times 56 | - Aggregate the predictions 57 | - Submit prediction (if publish_reports_to_metaculus is True) 58 | - Return a list of ForecastReport objects 59 | 60 | Alternatively, you can use the MetaculusClient to make a custom filter of questions to forecast on 61 | and forecast them with `bot.forecast_questions(questions)` 62 | 63 | Only the research and forecast functions need to be implemented in ForecastBot subclasses, 64 | though you may want to override other ForecastBot functions. 65 | In this example, you can change the prompts to be whatever you want, since 66 | structure_output uses an LLM to intelligently reformat the output into the needed structure. 67 | 68 | By default (i.e. 'tournament' mode), when you run this script, it will forecast on any open questions in the 69 | primary bot tournament and MiniBench. If you want to forecast on only one or the other, you can remove one 70 | of them from the 'tournament' mode code at the bottom of the file. 71 | 72 | You can experiment with what models work best with your bot by using the `llms` parameter when initializing the bot. 73 | You can initialize the bot with any number of models. For example, 74 | ```python 75 | my_bot = MyBot( 76 | ...
77 | llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you 78 | "default": GeneralLlm( 79 | model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm) 80 | temperature=0.3, 81 | timeout=40, 82 | allowed_tries=2, 83 | ), 84 | "summarizer": "openai/gpt-4o-mini", 85 | "researcher": "asknews/news-summaries", 86 | "parser": "openai/gpt-4o-mini", 87 | }, 88 | ) 89 | ``` 90 | 91 | Then you can access the model in custom functions like this: 92 | ```python 93 | research_strategy = self.get_llm("researcher", "model_name") 94 | if research_strategy == "asknews/news-summaries": 95 | ... 96 | # OR 97 | summarizer = await self.get_llm("summarizer", "llm").invoke(prompt) 98 | # OR 99 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 100 | ``` 101 | 102 | If you end up having trouble with rate limits and want to try a more sophisticated rate limiter, try: 103 | ```python 104 | from forecasting_tools import RefreshingBucketRateLimiter 105 | rate_limiter = RefreshingBucketRateLimiter( 106 | capacity=2, 107 | refresh_rate=1, 108 | ) # Allows 1 request per second on average with a burst of 2 requests initially. Set this as a class variable 109 | await self.rate_limiter.wait_till_able_to_acquire_resources(1) # 1 because it's consuming 1 request (use more if you are adding a token limit) 110 | ``` 111 | Additionally, OpenRouter has large rate limits immediately on account creation. 112 | """ 113 | 114 | _max_concurrent_questions = ( 115 | 1 # Set this to whatever works for your search-provider/ai-model rate limits 116 | ) 117 | _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions) 118 | _structure_output_validation_samples = 2 119 | 120 | ##################################### RESEARCH ##################################### 121 | 122 | async def run_research(self, question: MetaculusQuestion) -> str: 123 | async with self._concurrency_limiter: 124 | research = "" 125 | researcher = self.get_llm("researcher") 126 | 127 | prompt = clean_indents( 128 | f""" 129 | You are an assistant to a superforecaster. 130 | The superforecaster will give you a question they intend to forecast on. 131 | To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information. 132 | You do not produce forecasts yourself.
133 | 134 | Question: 135 | {question.question_text} 136 | 137 | This question's outcome will be determined by the specific criteria below: 138 | {question.resolution_criteria} 139 | 140 | {question.fine_print} 141 | """ 142 | ) 143 | 144 | if isinstance(researcher, GeneralLlm): 145 | research = await researcher.invoke(prompt) 146 | elif ( 147 | researcher == "asknews/news-summaries" 148 | or researcher == "asknews/deep-research/low-depth" 149 | or researcher == "asknews/deep-research/medium-depth" 150 | or researcher == "asknews/deep-research/high-depth" 151 | ): 152 | research = await AskNewsSearcher().call_preconfigured_version( 153 | researcher, prompt 154 | ) 155 | elif researcher.startswith("smart-searcher"): 156 | model_name = researcher.removeprefix("smart-searcher/") 157 | searcher = SmartSearcher( 158 | model=model_name, 159 | temperature=0, 160 | num_searches_to_run=2, 161 | num_sites_per_search=10, 162 | use_advanced_filters=False, 163 | ) 164 | research = await searcher.invoke(prompt) 165 | elif not researcher or researcher == "None" or researcher == "no_research": 166 | research = "" 167 | else: 168 | research = await self.get_llm("researcher", "llm").invoke(prompt) 169 | logger.info(f"Found Research for URL {question.page_url}:\n{research}") 170 | return research 171 | 172 | ##################################### BINARY QUESTIONS ##################################### 173 | 174 | async def _run_forecast_on_binary( 175 | self, question: BinaryQuestion, research: str 176 | ) -> ReasonedPrediction[float]: 177 | prompt = clean_indents( 178 | f""" 179 | You are a professional forecaster interviewing for a job. 180 | 181 | Your interview question is: 182 | {question.question_text} 183 | 184 | Question background: 185 | {question.background_info} 186 | 187 | 188 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: 189 | {question.resolution_criteria} 190 | 191 | {question.fine_print} 192 | 193 | 194 | Your research assistant says: 195 | {research} 196 | 197 | Today is {datetime.now().strftime("%Y-%m-%d")}. 198 | 199 | Before answering you write: 200 | (a) The time left until the outcome to the question is known. 201 | (b) The status quo outcome if nothing changed. 202 | (c) A brief description of a scenario that results in a No outcome. 203 | (d) A brief description of a scenario that results in a Yes outcome. 204 | 205 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 206 | {self._get_conditional_disclaimer_if_necessary(question)} 207 | 208 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100 209 | """ 210 | ) 211 | 212 | return await self._binary_prompt_to_forecast(question, prompt) 213 | 214 | async def _binary_prompt_to_forecast( 215 | self, 216 | question: BinaryQuestion, 217 | prompt: str, 218 | ) -> ReasonedPrediction[float]: 219 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 220 | logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") 221 | binary_prediction: BinaryPrediction = await structure_output( 222 | reasoning, 223 | BinaryPrediction, 224 | model=self.get_llm("parser", "llm"), 225 | num_validation_samples=self._structure_output_validation_samples, 226 | ) 227 | decimal_pred = max(0.01, min(0.99, binary_prediction.prediction_in_decimal)) 228 | 229 | logger.info( 230 | f"Forecasted URL {question.page_url} with prediction: {decimal_pred}." 
231 | ) 232 | return ReasonedPrediction(prediction_value=decimal_pred, reasoning=reasoning) 233 | 234 | ##################################### MULTIPLE CHOICE QUESTIONS ##################################### 235 | 236 | async def _run_forecast_on_multiple_choice( 237 | self, question: MultipleChoiceQuestion, research: str 238 | ) -> ReasonedPrediction[PredictedOptionList]: 239 | prompt = clean_indents( 240 | f""" 241 | You are a professional forecaster interviewing for a job. 242 | 243 | Your interview question is: 244 | {question.question_text} 245 | 246 | The options are: {question.options} 247 | 248 | 249 | Background: 250 | {question.background_info} 251 | 252 | {question.resolution_criteria} 253 | 254 | {question.fine_print} 255 | 256 | 257 | Your research assistant says: 258 | {research} 259 | 260 | Today is {datetime.now().strftime("%Y-%m-%d")}. 261 | 262 | Before answering you write: 263 | (a) The time left until the outcome to the question is known. 264 | (b) The status quo outcome if nothing changed. 265 | (c) A description of a scenario that results in an unexpected outcome. 266 | 267 | {self._get_conditional_disclaimer_if_necessary(question)} 268 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes. 269 | 270 | The last thing you write is your final probabilities for the N options in this order {question.options} as: 271 | Option_A: Probability_A 272 | Option_B: Probability_B 273 | ... 274 | Option_N: Probability_N 275 | """ 276 | ) 277 | return await self._multiple_choice_prompt_to_forecast(question, prompt) 278 | 279 | async def _multiple_choice_prompt_to_forecast( 280 | self, 281 | question: MultipleChoiceQuestion, 282 | prompt: str, 283 | ) -> ReasonedPrediction[PredictedOptionList]: 284 | parsing_instructions = clean_indents( 285 | f""" 286 | Make sure that all option names are one of the following: 287 | {question.options} 288 | 289 | The text you are parsing may prepend these options with some variation of "Option" which you should remove if not part of the option names I just gave you. 290 | Additionally, you may sometimes need to parse a 0% probability. Please do not skip options with 0%, but rather include them in your final list with 0% probability. 291 | """ 292 | ) 293 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 294 | logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") 295 | predicted_option_list: PredictedOptionList = await structure_output( 296 | text_to_structure=reasoning, 297 | output_type=PredictedOptionList, 298 | model=self.get_llm("parser", "llm"), 299 | num_validation_samples=self._structure_output_validation_samples, 300 | additional_instructions=parsing_instructions, 301 | ) 302 | 303 | logger.info( 304 | f"Forecasted URL {question.page_url} with prediction: {predicted_option_list}."
305 | ) 306 | return ReasonedPrediction( 307 | prediction_value=predicted_option_list, reasoning=reasoning 308 | ) 309 | 310 | ##################################### NUMERIC QUESTIONS ##################################### 311 | 312 | async def _run_forecast_on_numeric( 313 | self, question: NumericQuestion, research: str 314 | ) -> ReasonedPrediction[NumericDistribution]: 315 | upper_bound_message, lower_bound_message = ( 316 | self._create_upper_and_lower_bound_messages(question) 317 | ) 318 | prompt = clean_indents( 319 | f""" 320 | You are a professional forecaster interviewing for a job. 321 | 322 | Your interview question is: 323 | {question.question_text} 324 | 325 | Background: 326 | {question.background_info} 327 | 328 | {question.resolution_criteria} 329 | 330 | {question.fine_print} 331 | 332 | Units for answer: {question.unit_of_measure if question.unit_of_measure else "Not stated (please infer this)"} 333 | 334 | Your research assistant says: 335 | {research} 336 | 337 | Today is {datetime.now().strftime("%Y-%m-%d")}. 338 | 339 | {lower_bound_message} 340 | {upper_bound_message} 341 | 342 | Formatting Instructions: 343 | - Please notice the units requested and give your answer in these units (e.g. whether you represent a number as 1,000,000 or 1 million). 344 | - Never use scientific notation. 345 | - Always start with a smaller number (more negative if negative) and then increase from there. The value for percentile 10 should always be less than the value for percentile 20, and so on. 346 | 347 | Before answering you write: 348 | (a) The time left until the outcome to the question is known. 349 | (b) The outcome if nothing changed. 350 | (c) The outcome if the current trend continued. 351 | (d) The expectations of experts and markets. 352 | (e) A brief description of an unexpected scenario that results in a low outcome. 353 | (f) A brief description of an unexpected scenario that results in a high outcome. 354 | 355 | {self._get_conditional_disclaimer_if_necessary(question)} 356 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. 357 | 358 | The last thing you write is your final answer as: 359 | " 360 | Percentile 10: XX (lowest number value) 361 | Percentile 20: XX 362 | Percentile 40: XX 363 | Percentile 60: XX 364 | Percentile 80: XX 365 | Percentile 90: XX (highest number value) 366 | " 367 | """ 368 | ) 369 | return await self._numeric_prompt_to_forecast(question, prompt) 370 | 371 | async def _numeric_prompt_to_forecast( 372 | self, 373 | question: NumericQuestion, 374 | prompt: str, 375 | ) -> ReasonedPrediction[NumericDistribution]: 376 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 377 | logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") 378 | parsing_instructions = clean_indents( 379 | f""" 380 | The text given to you is trying to give a forecast distribution for a numeric question. 381 | - This text is trying to answer the numeric question: "{question.question_text}". 382 | - When parsing the text, please make sure to give the values (the ones assigned to percentiles) in terms of the correct units. 383 | - The units for the forecast are: {question.unit_of_measure} 384 | - Your work will be shown publicly with these units stated verbatim after the numbers you parse.
385 | - As an example, someone else guessed that the answer will be between {question.lower_bound} {question.unit_of_measure} and {question.upper_bound} {question.unit_of_measure}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". 386 | - If the answer isn't given in the correct units, you should parse it into the right units. For instance, if the answer gives numbers as $500,000,000 and units are "B $" then you should parse the answer as 0.5 (since $500,000,000 is $0.5 billion). 387 | - If percentiles are not explicitly given (e.g. only a single value is given), please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. 388 | - Turn any values that are in scientific notation into regular numbers. 389 | """ 390 | ) 391 | percentile_list: list[Percentile] = await structure_output( 392 | reasoning, 393 | list[Percentile], 394 | model=self.get_llm("parser", "llm"), 395 | additional_instructions=parsing_instructions, 396 | num_validation_samples=self._structure_output_validation_samples, 397 | ) 398 | prediction = NumericDistribution.from_question(percentile_list, question) 399 | logger.info( 400 | f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." 401 | ) 402 | return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) 403 | 404 | ##################################### DATE QUESTIONS ##################################### 405 | 406 | async def _run_forecast_on_date( 407 | self, question: DateQuestion, research: str 408 | ) -> ReasonedPrediction[NumericDistribution]: 409 | upper_bound_message, lower_bound_message = ( 410 | self._create_upper_and_lower_bound_messages(question) 411 | ) 412 | prompt = clean_indents( 413 | f""" 414 | You are a professional forecaster interviewing for a job. 415 | 416 | Your interview question is: 417 | {question.question_text} 418 | 419 | Background: 420 | {question.background_info} 421 | 422 | {question.resolution_criteria} 423 | 424 | {question.fine_print} 425 | 426 | Your research assistant says: 427 | {research} 428 | 429 | Today is {datetime.now().strftime("%Y-%m-%d")}. 430 | 431 | {lower_bound_message} 432 | {upper_bound_message} 433 | 434 | Formatting Instructions: 435 | - This is a date question, and as such, the answer must be expressed in terms of dates. 436 | - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. 437 | - Always start with the chronologically earliest date and then increase from there. 438 | - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. 439 | 440 | Before answering you write: 441 | (a) The time left until the outcome to the question is known. 442 | (b) The outcome if nothing changed. 443 | (c) The outcome if the current trend continued. 444 | (d) The expectations of experts and markets. 445 | (e) A brief description of an unexpected scenario that results in a low outcome. 446 | (f) A brief description of an unexpected scenario that results in a high outcome. 447 | 448 | {self._get_conditional_disclaimer_if_necessary(question)} 449 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.
450 | 451 | The last thing you write is your final answer as: 452 | " 453 | Percentile 10: YYYY-MM-DD (oldest date) 454 | Percentile 20: YYYY-MM-DD 455 | Percentile 40: YYYY-MM-DD 456 | Percentile 60: YYYY-MM-DD 457 | Percentile 80: YYYY-MM-DD 458 | Percentile 90: YYYY-MM-DD (newest date) 459 | " 460 | """ 461 | ) 462 | forecast = await self._date_prompt_to_forecast(question, prompt) 463 | return forecast 464 | 465 | async def _date_prompt_to_forecast( 466 | self, 467 | question: DateQuestion, 468 | prompt: str, 469 | ) -> ReasonedPrediction[NumericDistribution]: 470 | reasoning = await self.get_llm("default", "llm").invoke(prompt) 471 | logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") 472 | parsing_instructions = clean_indents( 473 | f""" 474 | The text given to you is trying to give a forecast distribution for a date question. 475 | - This text is trying to answer the question: "{question.question_text}". 476 | - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". 477 | - The output is given as dates/times; please format them as valid, parsable datetime strings. Assume midnight UTC if no hour is given. 478 | - If percentiles are not explicitly given (e.g. only a single value is given), please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. 479 | """ 480 | ) 481 | date_percentile_list: list[DatePercentile] = await structure_output( 482 | reasoning, 483 | list[DatePercentile], 484 | model=self.get_llm("parser", "llm"), 485 | additional_instructions=parsing_instructions, 486 | num_validation_samples=self._structure_output_validation_samples, 487 | ) 488 | 489 | percentile_list = [ 490 | Percentile( 491 | percentile=percentile.percentile, 492 | value=percentile.value.timestamp(), 493 | ) 494 | for percentile in date_percentile_list 495 | ] 496 | prediction = NumericDistribution.from_question(percentile_list, question) 497 | logger.info( 498 | f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." 499 | ) 500 | return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) 501 | 502 | def _create_upper_and_lower_bound_messages( 503 | self, question: NumericQuestion | DateQuestion 504 | ) -> tuple[str, str]: 505 | if isinstance(question, NumericQuestion): 506 | if question.nominal_upper_bound is not None: 507 | upper_bound_number = question.nominal_upper_bound 508 | else: 509 | upper_bound_number = question.upper_bound 510 | if question.nominal_lower_bound is not None: 511 | lower_bound_number = question.nominal_lower_bound 512 | else: 513 | lower_bound_number = question.lower_bound 514 | unit_of_measure = question.unit_of_measure 515 | elif isinstance(question, DateQuestion): 516 | upper_bound_number = question.upper_bound.date().isoformat() 517 | lower_bound_number = question.lower_bound.date().isoformat() 518 | unit_of_measure = "" 519 | else: 520 | raise ValueError() 521 | 522 | if question.open_upper_bound: 523 | upper_bound_message = f"The question creator thinks the number is likely not higher than {upper_bound_number} {unit_of_measure}." 524 | else: 525 | upper_bound_message = f"The outcome can not be higher than {upper_bound_number} {unit_of_measure}."
526 | 527 | if question.open_lower_bound: 528 | lower_bound_message = f"The question creator thinks the number is likely not lower than {lower_bound_number} {unit_of_measure}." 529 | else: 530 | lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." 531 | return upper_bound_message, lower_bound_message 532 | 533 | ##################################### CONDITIONAL QUESTIONS ##################################### 534 | 535 | async def _run_forecast_on_conditional( 536 | self, question: ConditionalQuestion, research: str 537 | ) -> ReasonedPrediction[ConditionalPrediction]: 538 | parent_info, full_research = await self._get_question_prediction_info( 539 | question.parent, research, "parent" 540 | ) 541 | child_info, full_research = await self._get_question_prediction_info( 542 | question.child, research, "child" 543 | ) 544 | yes_info, full_research = await self._get_question_prediction_info( 545 | question.question_yes, full_research, "yes" 546 | ) 547 | no_info, full_research = await self._get_question_prediction_info( 548 | question.question_no, full_research, "no" 549 | ) 550 | full_reasoning = clean_indents( 551 | f""" 552 | ## Parent Question Reasoning 553 | {parent_info.reasoning} 554 | ## Child Question Reasoning 555 | {child_info.reasoning} 556 | ## Yes Question Reasoning 557 | {yes_info.reasoning} 558 | ## No Question Reasoning 559 | {no_info.reasoning} 560 | """ 561 | ) 562 | full_prediction = ConditionalPrediction( 563 | parent=parent_info.prediction_value, # type: ignore 564 | child=child_info.prediction_value, # type: ignore 565 | prediction_yes=yes_info.prediction_value, # type: ignore 566 | prediction_no=no_info.prediction_value, # type: ignore 567 | ) 568 | return ReasonedPrediction( 569 | reasoning=full_reasoning, prediction_value=full_prediction 570 | ) 571 | 572 | async def _get_question_prediction_info( 573 | self, question: MetaculusQuestion, research: str, question_type: str 574 | ) -> tuple[ReasonedPrediction[PredictionTypes | PredictionAffirmed], str]: 575 | from forecasting_tools.data_models.data_organizer import DataOrganizer 576 | 577 | previous_forecasts = question.previous_forecasts 578 | if ( 579 | question_type in ["parent", "child"] 580 | and previous_forecasts 581 | and question_type not in self.force_reforecast_in_conditional 582 | ): 583 | # TODO: add option to not affirm current parent/child forecasts, create new forecast 584 | previous_forecast = previous_forecasts[-1] 585 | current_utc_time = datetime.now(timezone.utc) 586 | if ( 587 | previous_forecast.timestamp_end is None 588 | or previous_forecast.timestamp_end > current_utc_time 589 | ): 590 | pretty_value = DataOrganizer.get_readable_prediction(previous_forecast) # type: ignore 591 | prediction = ReasonedPrediction( 592 | prediction_value=PredictionAffirmed(), 593 | reasoning=f"Already existing forecast reaffirmed at {pretty_value}.", 594 | ) 595 | return (prediction, research) # type: ignore 596 | info = await self._make_prediction(question, research) 597 | full_research = self._add_reasoning_to_research(research, info, question_type) 598 | return info, full_research # type: ignore 599 | 600 | def _add_reasoning_to_research( 601 | self, 602 | research: str, 603 | reasoning: ReasonedPrediction[PredictionTypes], 604 | question_type: str, 605 | ) -> str: 606 | from forecasting_tools.data_models.data_organizer import DataOrganizer 607 | 608 | question_type = question_type.title() 609 | return clean_indents( 610 | f""" 611 | {research} 612 | --- 613 | ## 
{question_type} Question Information
614 |             You have previously forecasted the {question_type} Question to the value: {DataOrganizer.get_readable_prediction(reasoning.prediction_value)}
615 |             This is NOT your current forecast; it is a previous forecast whose information is relevant to the forecast you are making now.
616 |             The reasoning for the {question_type} Question was as follows:
617 |             ```
618 |             {reasoning.reasoning}
619 |             ```
620 |             This is absolutely essential: do NOT use this reasoning to re-forecast the {question_type} question.
621 |             """
622 |         )
623 | 
624 |     def _get_conditional_disclaimer_if_necessary(
625 |         self, question: MetaculusQuestion
626 |     ) -> str:
627 |         if question.conditional_type not in ["yes", "no"]:
628 |             return ""
629 |         return clean_indents(
630 |             """
631 |             As you are given a conditional question with a parent and child, you are to forecast only the **CHILD** question, given the parent question's resolution.
632 |             You never re-forecast the parent question under any circumstances; instead you use probabilistic reasoning, strongly considering the parent question's resolution, to forecast the child question.
633 |             """
634 |         )
635 | 
636 | 
637 | if __name__ == "__main__":
638 |     logging.basicConfig(
639 |         level=logging.INFO,
640 |         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
641 |     )
642 | 
643 |     # Suppress LiteLLM logging
644 |     litellm_logger = logging.getLogger("LiteLLM")
645 |     litellm_logger.setLevel(logging.WARNING)
646 |     litellm_logger.propagate = False
647 | 
648 |     parser = argparse.ArgumentParser(
649 |         description="Run the TemplateBot forecasting system"
650 |     )
651 |     parser.add_argument(
652 |         "--mode",
653 |         type=str,
654 |         choices=["tournament", "metaculus_cup", "test_questions"],
655 |         default="tournament",
656 |         help="Specify the run mode (default: tournament)",
657 |     )
658 |     args = parser.parse_args()
659 |     run_mode: Literal["tournament", "metaculus_cup", "test_questions"] = args.mode
660 |     assert run_mode in [
661 |         "tournament",
662 |         "metaculus_cup",
663 |         "test_questions",
664 |     ], "Invalid run mode"
665 | 
666 |     template_bot = SpringTemplateBot2026(
667 |         research_reports_per_question=1,
668 |         predictions_per_research_report=5,
669 |         use_research_summary_to_forecast=False,
670 |         publish_reports_to_metaculus=True,
671 |         folder_to_save_reports_to=None,
672 |         skip_previously_forecasted_questions=True,
673 |         extra_metadata_in_explanation=True,
674 |         # llms={  # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you
675 |         #     "default": GeneralLlm(
676 |         #         model="openrouter/openai/gpt-4o",  # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm)
677 |         #         temperature=0.3,
678 |         #         timeout=40,
679 |         #         allowed_tries=2,
680 |         #     ),
681 |         #     "summarizer": "openai/gpt-4o-mini",
682 |         #     "researcher": "asknews/news-summaries",
683 |         #     "parser": "openai/gpt-4o-mini",
684 |         # },
685 |     )
686 | 
687 |     client = MetaculusClient()
688 |     if run_mode == "tournament":
689 |         # You may want to change this to the specific tournament ID you want to forecast on
690 |         seasonal_tournament_reports = asyncio.run(
691 |             template_bot.forecast_on_tournament(
692 |                 client.CURRENT_AI_COMPETITION_ID, return_exceptions=True
693 |             )
694 |         )
695 |         minibench_reports = asyncio.run(
696 |             template_bot.forecast_on_tournament(
697 |                 client.CURRENT_MINIBENCH_ID, return_exceptions=True
698 |             )
699 |         )
700 |         forecast_reports = seasonal_tournament_reports + minibench_reports
701 |     elif run_mode == "metaculus_cup":
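        # A minimal sketch (hypothetical target) of pointing this branch at a different
        # tournament instead, using one of the IDs mentioned in the comment below:
        #   forecast_reports = asyncio.run(
        #       template_bot.forecast_on_tournament("ai-2027", return_exceptions=True)
        #   )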
702 |         # The Metaculus cup is a good way to test the bot's performance on regularly open questions. You can also use AXC_2025_TOURNAMENT_ID = 32564 or AI_2027_TOURNAMENT_ID = "ai-2027"
703 |         # The Metaculus cup may not be initialized near the beginning of a season (i.e. January, May, September)
704 |         template_bot.skip_previously_forecasted_questions = False
705 |         forecast_reports = asyncio.run(
706 |             template_bot.forecast_on_tournament(
707 |                 client.CURRENT_METACULUS_CUP_ID, return_exceptions=True
708 |             )
709 |         )
710 |     elif run_mode == "test_questions":
711 |         # Example questions are a good way to test the bot's performance on a single question
712 |         EXAMPLE_QUESTIONS = [
713 |             "https://www.metaculus.com/questions/578/human-extinction-by-2100/",  # Human Extinction - Binary
714 |             "https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/",  # Age of Oldest Human - Numeric
715 |             "https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/",  # Number of New Leading AI Labs - Multiple Choice
716 |             "https://www.metaculus.com/c/diffusion-community/38880/how-many-us-labor-strikes-due-to-ai-in-2029/",  # Number of US Labor Strikes Due to AI in 2029 - Discrete
717 |         ]
718 |         template_bot.skip_previously_forecasted_questions = False
719 |         questions = [
720 |             client.get_question_by_url(question_url)
721 |             for question_url in EXAMPLE_QUESTIONS
722 |         ]
723 |         forecast_reports = asyncio.run(
724 |             template_bot.forecast_questions(questions, return_exceptions=True)
725 |         )
726 |     template_bot.log_report_summary(forecast_reports)
727 | 
--------------------------------------------------------------------------------
/main_with_no_framework.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import datetime
3 | import json
4 | import os
5 | import re
6 | 
7 | import dotenv
8 | 
9 | dotenv.load_dotenv()
10 | 
11 | import forecasting_tools
12 | import numpy as np
13 | import requests
14 | from asknews_sdk import AskNewsSDK
15 | from openai import AsyncOpenAI
16 | 
17 | 
18 | """
19 | Updates pending for Spring season:
20 | - Adding better handling for log scaled questions
21 | - Adding CDF standardization
22 | 
23 | This file provides a simple forecasting bot built from the ground up.
24 | We provide this for people who want to dissect
25 | it to build their own bot without using forecasting-tools.
26 | 
27 | This template assumes you are using an OpenAI model and have an OpenAI API key.
28 | You will also need a Metaculus API key for posting forecasts to Metaculus,
29 | and a Perplexity or AskNews API key for online research.
30 | 
31 | This is not fully representative of the template bot used by Metaculus, as there are some
32 | differences in implementation. The actual template bot (e.g. main.py) has the following differences:
33 | - An LLM now parses the final forecast output (rather than programmatic parsing)
34 | - Support for nominal bounds was added (i.e. when there are discrete questions and normal upper/lower bounds are not as intuitive)
35 | - Upper/Lower bounds are mentioned as suggestions (not ignored) when the bounds are open
36 | - Group questions, conditional questions, and date questions are supported (these are extra and won't be launched in Spring AIB)
37 | - The research prompt mentions resolution criteria and fine print explicitly
38 | 
39 | We realize the code below could probably be cleaned up a bit in a few places,
40 | though we assume most people will dissect it enough that this does not matter much.
41 | 
42 | Note that this code is given as-is, and though we have done basic testing
43 | with this file, it may be worth double-checking key components locally.
44 | """
45 | 
46 | 
47 | ######################### CONSTANTS #########################
48 | # Constants
49 | SUBMIT_PREDICTION = True  # set to True to publish your predictions to Metaculus
50 | USE_EXAMPLE_QUESTIONS = False  # set to True to forecast example questions rather than the tournament questions
51 | NUM_RUNS_PER_QUESTION = 5  # The median forecast is taken across NUM_RUNS_PER_QUESTION runs
52 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS = True
53 | 
54 | # Environment variables
55 | # You only need *either* Exa or Perplexity or AskNews keys for online research
56 | METACULUS_TOKEN = os.getenv("METACULUS_TOKEN")
57 | PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
58 | ASKNEWS_CLIENT_ID = os.getenv("ASKNEWS_CLIENT_ID")
59 | ASKNEWS_SECRET = os.getenv("ASKNEWS_SECRET")
60 | EXA_API_KEY = os.getenv("EXA_API_KEY")
61 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # You'll also need the OpenAI API Key if you want to use the Exa Smart Searcher
62 | 
63 | # The tournament IDs below can be used for testing your bot.
64 | Q4_2024_AI_BENCHMARKING_ID = 32506
65 | Q1_2025_AI_BENCHMARKING_ID = 32627
66 | FALL_2025_AI_BENCHMARKING_ID = "fall-aib-2025"
67 | CURRENT_MINIBENCH_ID = "minibench"
68 | 
69 | Q4_2024_QUARTERLY_CUP_ID = 3672
70 | Q1_2025_QUARTERLY_CUP_ID = 32630
71 | CURRENT_METACULUS_CUP_ID = "metaculus-cup"
72 | 
73 | AXC_2025_TOURNAMENT_ID = 32564
74 | AI_2027_TOURNAMENT_ID = "ai-2027"
75 | 
76 | TOURNAMENT_ID = FALL_2025_AI_BENCHMARKING_ID
77 | 
78 | # The example questions can be used for testing your bot. (note that question and post id are not always the same)
79 | EXAMPLE_QUESTIONS = [  # (question_id, post_id)
80 |     (578, 578),  # Human Extinction - Binary - https://www.metaculus.com/questions/578/human-extinction-by-2100/
81 |     (14333, 14333),  # Age of Oldest Human - Numeric - https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/
82 |     (22427, 22427),  # Number of New Leading AI Labs - Multiple Choice - https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/
83 |     (38195, 38880),  # Number of US Labor Strikes Due to AI in 2029 - Discrete - https://www.metaculus.com/c/diffusion-community/38880/how-many-us-labor-strikes-due-to-ai-in-2029/
84 | ]
85 | 
86 | 
87 | ######################### HELPER FUNCTIONS #########################
88 | 
89 | # @title Helper functions
90 | AUTH_HEADERS = {"headers": {"Authorization": f"Token {METACULUS_TOKEN}"}}
91 | API_BASE_URL = "https://www.metaculus.com/api"
92 | 
93 | 
94 | def post_question_comment(post_id: int, comment_text: str) -> None:
95 |     """
96 |     Post a comment on the question page as the bot user.
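
    Example (illustrative arguments, reusing a post id from EXAMPLE_QUESTIONS):
        post_question_comment(578, "My bot's reasoning...")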
97 | """ 98 | 99 | response = requests.post( 100 | f"{API_BASE_URL}/comments/create/", 101 | json={ 102 | "text": comment_text, 103 | "parent": None, 104 | "included_forecast": True, 105 | "is_private": True, 106 | "on_post": post_id, 107 | }, 108 | **AUTH_HEADERS, # type: ignore 109 | ) 110 | if not response.ok: 111 | raise RuntimeError(response.text) 112 | 113 | 114 | def post_question_prediction(question_id: int, forecast_payload: dict) -> None: 115 | """ 116 | Post a forecast on a question. 117 | """ 118 | url = f"{API_BASE_URL}/questions/forecast/" 119 | response = requests.post( 120 | url, 121 | json=[ 122 | { 123 | "question": question_id, 124 | **forecast_payload, 125 | }, 126 | ], 127 | **AUTH_HEADERS, # type: ignore 128 | ) 129 | print(f"Prediction Post status code: {response.status_code}") 130 | if not response.ok: 131 | raise RuntimeError(response.text) 132 | 133 | 134 | def create_forecast_payload( 135 | forecast: float | dict[str, float] | list[float], 136 | question_type: str, 137 | ) -> dict: 138 | """ 139 | Accepts a forecast and generates the api payload in the correct format. 140 | 141 | If the question is binary, forecast must be a float. 142 | If the question is multiple choice, forecast must be a dictionary that 143 | maps question.options labels to floats. 144 | If the question is numeric, forecast must be a dictionary that maps 145 | quartiles or percentiles to datetimes, or a 201 value cdf. 146 | """ 147 | if question_type == "binary": 148 | return { 149 | "probability_yes": forecast, 150 | "probability_yes_per_category": None, 151 | "continuous_cdf": None, 152 | } 153 | if question_type == "multiple_choice": 154 | return { 155 | "probability_yes": None, 156 | "probability_yes_per_category": forecast, 157 | "continuous_cdf": None, 158 | } 159 | # numeric or date 160 | return { 161 | "probability_yes": None, 162 | "probability_yes_per_category": None, 163 | "continuous_cdf": forecast, 164 | } 165 | 166 | 167 | def list_posts_from_tournament( 168 | tournament_id: int | str = TOURNAMENT_ID, offset: int = 0, count: int = 50 169 | ) -> list[dict]: 170 | """ 171 | List (all details) {count} posts from the {tournament_id} 172 | """ 173 | url_qparams = { 174 | "limit": count, 175 | "offset": offset, 176 | "order_by": "-hotness", 177 | "forecast_type": ",".join( 178 | [ 179 | "binary", 180 | "multiple_choice", 181 | "numeric", 182 | "discrete", 183 | ] 184 | ), 185 | "tournaments": [tournament_id], 186 | "statuses": "open", 187 | "include_description": "true", 188 | } 189 | url = f"{API_BASE_URL}/posts/" 190 | response = requests.get(url, **AUTH_HEADERS, params=url_qparams) # type: ignore 191 | if not response.ok: 192 | raise Exception(response.text) 193 | data = json.loads(response.content) 194 | return data 195 | 196 | 197 | def get_open_question_ids_from_tournament() -> list[tuple[int, int]]: 198 | posts = list_posts_from_tournament() 199 | 200 | post_dict = dict() 201 | for post in posts["results"]: 202 | if question := post.get("question"): 203 | # single question post 204 | post_dict[post["id"]] = [question] 205 | 206 | open_question_id_post_id = [] # [(question_id, post_id)] 207 | for post_id, questions in post_dict.items(): 208 | for question in questions: 209 | if question.get("status") == "open": 210 | print( 211 | f"ID: {question['id']}\nQ: {question['title']}\nCloses: " 212 | f"{question['scheduled_close_time']}" 213 | ) 214 | open_question_id_post_id.append((question["id"], post_id)) 215 | 216 | return open_question_id_post_id 217 | 218 | 219 | def 
get_post_details(post_id: int) -> dict:
220 |     """
221 |     Get all details about a post from the Metaculus API.
222 |     """
223 |     url = f"{API_BASE_URL}/posts/{post_id}/"
224 |     print(f"Getting details for {url}")
225 |     response = requests.get(
226 |         url,
227 |         **AUTH_HEADERS,  # type: ignore
228 |     )
229 |     if not response.ok:
230 |         raise Exception(response.text)
231 |     details = json.loads(response.content)
232 |     return details
233 | 
234 | CONCURRENT_REQUESTS_LIMIT = 5
235 | llm_rate_limiter = asyncio.Semaphore(CONCURRENT_REQUESTS_LIMIT)
236 | 
237 | 
238 | async def call_llm(prompt: str, model: str = "gpt-4o", temperature: float = 0.3) -> str:
239 |     """
240 |     Makes a (non-streaming) completion request to OpenAI's API with concurrent request limiting.
241 |     """
242 | 
243 |     # Pass a base_url parameter to AsyncOpenAI if you want to route through a proxy; with no base_url this calls the OpenAI API directly
244 |     # Also check out the package 'litellm' for one function that can call any model from any provider
245 |     # Also check out OpenRouter for allowing one API key for many providers (especially powerful if combined with litellm)
246 |     client = AsyncOpenAI()
247 | 
248 |     async with llm_rate_limiter:
249 |         response = await client.chat.completions.create(
250 |             model=model,
251 |             messages=[{"role": "user", "content": prompt}],
252 |             temperature=temperature,
253 |             stream=False,
254 |         )
255 |         answer = response.choices[0].message.content
256 |         if answer is None:
257 |             raise ValueError("No answer returned from LLM")
258 |         return answer
259 | 
260 | 
261 | def run_research(question: str) -> str:
262 |     research = ""
263 |     if ASKNEWS_CLIENT_ID and ASKNEWS_SECRET:
264 |         research = call_asknews(question)
265 |     elif EXA_API_KEY:
266 |         research = call_exa_smart_searcher(question)
267 |     elif PERPLEXITY_API_KEY:
268 |         research = call_perplexity(question)
269 |     else:
270 |         research = "No research done"
271 | 
272 |     print(f"########################\nResearch Found:\n{research}\n########################")
273 | 
274 |     return research
275 | 
276 | def call_perplexity(question: str) -> str:
277 |     url = "https://api.perplexity.ai/chat/completions"
278 |     api_key = PERPLEXITY_API_KEY
279 |     headers = {
280 |         "accept": "application/json",
281 |         "authorization": f"Bearer {api_key}",
282 |         "content-type": "application/json",
283 |     }
284 |     payload = {
285 |         "model": "llama-3.1-sonar-huge-128k-online",
286 |         "messages": [
287 |             {
288 |                 "role": "system",  # this is a system prompt designed to guide the perplexity assistant
289 |                 "content": """
290 | You are an assistant to a superforecaster.
291 | The superforecaster will give you a question they intend to forecast on.
292 | To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
293 | You do not produce forecasts yourself.
294 | """, 295 | }, 296 | { 297 | "role": "user", # this is the actual prompt we ask the perplexity assistant to answer 298 | "content": question, 299 | }, 300 | ], 301 | } 302 | response = requests.post(url=url, json=payload, headers=headers) 303 | if not response.ok: 304 | raise Exception(response.text) 305 | content = response.json()["choices"][0]["message"]["content"] 306 | return content 307 | 308 | def call_exa_smart_searcher(question: str) -> str: 309 | if OPENAI_API_KEY is None: 310 | searcher = forecasting_tools.ExaSearcher( 311 | include_highlights=True, 312 | num_results=10, 313 | ) 314 | highlights = asyncio.run(searcher.invoke_for_highlights_in_relevance_order(question)) 315 | prioritized_highlights = highlights[:10] 316 | combined_highlights = "" 317 | for i, highlight in enumerate(prioritized_highlights): 318 | combined_highlights += f'[Highlight {i+1}]:\nTitle: {highlight.source.title}\nURL: {highlight.source.url}\nText: "{highlight.highlight_text}"\n\n' 319 | response = combined_highlights 320 | else: 321 | searcher = forecasting_tools.SmartSearcher( 322 | temperature=0, 323 | num_searches_to_run=2, 324 | num_sites_per_search=10, 325 | ) 326 | prompt = ( 327 | "You are an assistant to a superforecaster. The superforecaster will give" 328 | "you a question they intend to forecast on. To be a great assistant, you generate" 329 | "a concise but detailed rundown of the most relevant news, including if the question" 330 | "would resolve Yes or No based on current information. You do not produce forecasts yourself." 331 | f"\n\nThe question is: {question}" 332 | ) 333 | response = asyncio.run(searcher.invoke(prompt)) 334 | assert response is not None 335 | 336 | return response 337 | 338 | def call_asknews(question: str) -> str: 339 | """ 340 | Use the AskNews `news` endpoint to get news context for your query. 
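    Example (illustrative question): call_asknews("Will X happen by 2026?") returns
    a formatted string of recent and historical news articles.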
341 | The full API reference can be found here: https://docs.asknews.app/en/reference#get-/v1/news/search 342 | """ 343 | ask = AskNewsSDK( 344 | client_id=ASKNEWS_CLIENT_ID, client_secret=ASKNEWS_SECRET, scopes=set(["news"]) 345 | ) 346 | 347 | # get the latest news related to the query (within the past 48 hours) 348 | hot_response = ask.news.search_news( 349 | query=question, # your natural language query 350 | n_articles=6, # control the number of articles to include in the context, originally 5 351 | return_type="both", 352 | strategy="latest news", # enforces looking at the latest news only 353 | ) 354 | 355 | # get context from the "historical" database that contains a news archive going back to 2023 356 | historical_response = ask.news.search_news( 357 | query=question, 358 | n_articles=10, 359 | return_type="both", 360 | strategy="news knowledge", # looks for relevant news within the past 60 days 361 | ) 362 | 363 | hot_articles = hot_response.as_dicts 364 | historical_articles = historical_response.as_dicts 365 | formatted_articles = "Here are the relevant news articles:\n\n" 366 | 367 | if hot_articles: 368 | hot_articles = [article.__dict__ for article in hot_articles] 369 | hot_articles = sorted(hot_articles, key=lambda x: x["pub_date"], reverse=True) 370 | 371 | for article in hot_articles: 372 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p") 373 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n" 374 | 375 | if historical_articles: 376 | historical_articles = [article.__dict__ for article in historical_articles] 377 | historical_articles = sorted( 378 | historical_articles, key=lambda x: x["pub_date"], reverse=True 379 | ) 380 | 381 | for article in historical_articles: 382 | pub_date = article["pub_date"].strftime("%B %d, %Y %I:%M %p") 383 | formatted_articles += f"**{article['eng_title']}**\n{article['summary']}\nOriginal language: {article['language']}\nPublish date: {pub_date}\nSource:[{article['source_id']}]({article['article_url']})\n\n" 384 | 385 | if not hot_articles and not historical_articles: 386 | formatted_articles += "No articles were found.\n\n" 387 | return formatted_articles 388 | 389 | return formatted_articles 390 | 391 | ############### BINARY ############### 392 | # @title Binary prompt & functions 393 | 394 | # This section includes functionality for binary questions. 395 | 396 | BINARY_PROMPT_TEMPLATE = """ 397 | You are a professional forecaster interviewing for a job. 398 | 399 | Your interview question is: 400 | {title} 401 | 402 | Question background: 403 | {background} 404 | 405 | 406 | This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: 407 | {resolution_criteria} 408 | 409 | {fine_print} 410 | 411 | 412 | Your research assistant says: 413 | {summary_report} 414 | 415 | Today is {today}. 416 | 417 | Before answering you write: 418 | (a) The time left until the outcome to the question is known. 419 | (b) The status quo outcome if nothing changed. 420 | (c) A brief description of a scenario that results in a No outcome. 421 | (d) A brief description of a scenario that results in a Yes outcome. 422 | 423 | You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 
424 | 425 | The last thing you write is your final answer as: "Probability: ZZ%", 0-100 426 | """ 427 | 428 | 429 | def extract_probability_from_response_as_percentage_not_decimal( 430 | forecast_text: str, 431 | ) -> float: 432 | matches = re.findall(r"(\d+)%", forecast_text) 433 | if matches: 434 | # Return the last number found before a '%' 435 | number = int(matches[-1]) 436 | number = min(99, max(1, number)) # clamp the number between 1 and 99 437 | return number 438 | else: 439 | raise ValueError(f"Could not extract prediction from response: {forecast_text}") 440 | 441 | 442 | async def get_binary_gpt_prediction( 443 | question_details: dict, num_runs: int 444 | ) -> tuple[float, str]: 445 | 446 | today = datetime.datetime.now().strftime("%Y-%m-%d") 447 | title = question_details["title"] 448 | resolution_criteria = question_details["resolution_criteria"] 449 | background = question_details["description"] 450 | fine_print = question_details["fine_print"] 451 | question_type = question_details["type"] 452 | 453 | summary_report = run_research(title) 454 | 455 | content = BINARY_PROMPT_TEMPLATE.format( 456 | title=title, 457 | today=today, 458 | background=background, 459 | resolution_criteria=resolution_criteria, 460 | fine_print=fine_print, 461 | summary_report=summary_report, 462 | ) 463 | 464 | async def get_rationale_and_probability(content: str) -> tuple[float, str]: 465 | rationale = await call_llm(content) 466 | 467 | probability = extract_probability_from_response_as_percentage_not_decimal( 468 | rationale 469 | ) 470 | comment = ( 471 | f"Extracted Probability: {probability}%\n\nGPT's Answer: " 472 | f"{rationale}\n\n\n" 473 | ) 474 | return probability, comment 475 | 476 | probability_and_comment_pairs = await asyncio.gather( 477 | *[get_rationale_and_probability(content) for _ in range(num_runs)] 478 | ) 479 | comments = [pair[1] for pair in probability_and_comment_pairs] 480 | final_comment_sections = [ 481 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments) 482 | ] 483 | probabilities = [pair[0] for pair in probability_and_comment_pairs] 484 | median_probability = float(np.median(probabilities)) / 100 485 | 486 | final_comment = f"Median Probability: {median_probability}\n\n" + "\n\n".join( 487 | final_comment_sections 488 | ) 489 | return median_probability, final_comment 490 | 491 | 492 | ####################### NUMERIC ############### 493 | # @title Numeric prompt & functions 494 | 495 | NUMERIC_PROMPT_TEMPLATE = """ 496 | You are a professional forecaster interviewing for a job. 497 | 498 | Your interview question is: 499 | {title} 500 | 501 | Background: 502 | {background} 503 | 504 | {resolution_criteria} 505 | 506 | {fine_print} 507 | 508 | Units for answer: {units} 509 | 510 | Your research assistant says: 511 | {summary_report} 512 | 513 | Today is {today}. 514 | 515 | {lower_bound_message} 516 | {upper_bound_message} 517 | 518 | 519 | Formatting Instructions: 520 | - Please notice the units requested (e.g. whether you represent a number as 1,000,000 or 1m). 521 | - Never use scientific notation. 522 | - Always start with a smaller number (more negative if negative) and then increase from there 523 | 524 | Before answering you write: 525 | (a) The time left until the outcome to the question is known. 526 | (b) The outcome if nothing changed. 527 | (c) The outcome if the current trend continued. 528 | (d) The expectations of experts and markets. 529 | (e) A brief description of an unexpected scenario that results in a low outcome. 
530 | (f) A brief description of an unexpected scenario that results in a high outcome.
531 | 
532 | You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.
533 | 
534 | The last thing you write is your final answer as:
535 | "
536 | Percentile 10: XX
537 | Percentile 20: XX
538 | Percentile 40: XX
539 | Percentile 60: XX
540 | Percentile 80: XX
541 | Percentile 90: XX
542 | "
543 | """
544 | 
545 | 
546 | def extract_percentiles_from_response(forecast_text: str) -> dict:
547 | 
548 |     # Helper function that returns a dict of {percentile: value} parsed from every line mentioning "Percentile"
549 |     def extract_percentile_numbers(text) -> dict:
550 |         pattern = r"^.*(?:P|p)ercentile.*$"
551 |         number_pattern = r"-\s*(?:[^\d\-]*\s*)?(\d+(?:,\d{3})*(?:\.\d+)?)|(\d+(?:,\d{3})*(?:\.\d+)?)"
552 |         results = []
553 | 
554 |         for line in text.split("\n"):
555 |             if re.match(pattern, line):
556 |                 numbers = re.findall(number_pattern, line)
557 |                 numbers_no_commas = [
558 |                     next(num for num in match if num).replace(",", "")
559 |                     for match in numbers
560 |                 ]
561 |                 numbers = [
562 |                     float(num) if "." in num else int(num)
563 |                     for num in numbers_no_commas
564 |                 ]
565 |                 if len(numbers) > 1:
566 |                     first_number = numbers[0]
567 |                     last_number = numbers[-1]
568 |                     # Check if the original line had a negative sign before the last number
569 |                     if "-" in line.split(":")[-1]:
570 |                         last_number = -abs(last_number)
571 |                     results.append((first_number, last_number))
572 | 
573 |         # Convert results to dictionary
574 |         percentile_values = {}
575 |         for first_num, second_num in results:
576 |             key = first_num
577 |             percentile_values[key] = second_num
578 | 
579 |         return percentile_values
580 | 
581 |     percentile_values = extract_percentile_numbers(forecast_text)
582 | 
583 |     if len(percentile_values) > 0:
584 |         return percentile_values
585 |     else:
586 |         raise ValueError(f"Could not extract prediction from response: {forecast_text}")
587 | 
588 | 
589 | def generate_continuous_cdf(
590 |     percentile_values: dict,
591 |     question_type: str,
592 |     open_upper_bound: bool,
593 |     open_lower_bound: bool,
594 |     upper_bound: float,
595 |     lower_bound: float,
596 |     zero_point: float | None,
597 |     cdf_size: int,
598 | ) -> list[float]:
599 |     """
600 |     Returns: list[float]: A list of cdf_size float values (201 for numeric questions) representing the CDF.
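
    Example (illustrative values): percentile_values={10: 5, 40: 15, 90: 40} with
    closed bounds, lower_bound=0, upper_bound=50, zero_point=None, and cdf_size=201
    yields 201 cumulative probabilities evaluated at points spaced linearly across [0, 50].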
601 | """ 602 | 603 | percentile_max = max(float(key) for key in percentile_values.keys()) 604 | percentile_min = min(float(key) for key in percentile_values.keys()) 605 | range_min = lower_bound 606 | range_max = upper_bound 607 | range_size = range_max - range_min 608 | buffer = 1 if range_size > 100 else 0.01 * range_size 609 | 610 | # Adjust any values that are exactly at the bounds 611 | for percentile, value in list(percentile_values.items()): 612 | if not open_lower_bound and value <= range_min + buffer: 613 | percentile_values[percentile] = range_min + buffer 614 | if not open_upper_bound and value >= range_max - buffer: 615 | percentile_values[percentile] = range_max - buffer 616 | 617 | # Set cdf values outside range 618 | if open_upper_bound: 619 | if range_max > percentile_values[percentile_max]: 620 | percentile_values[int(100 - (0.5 * (100 - percentile_max)))] = range_max 621 | else: 622 | percentile_values[100] = range_max 623 | 624 | # Set cdf values outside range 625 | if open_lower_bound: 626 | if range_min < percentile_values[percentile_min]: 627 | percentile_values[int(0.5 * percentile_min)] = range_min 628 | else: 629 | percentile_values[0] = range_min 630 | 631 | sorted_percentile_values = dict(sorted(percentile_values.items())) 632 | 633 | # Normalize percentile keys 634 | normalized_percentile_values = {} 635 | for key, value in sorted_percentile_values.items(): 636 | percentile = float(key) / 100 637 | normalized_percentile_values[percentile] = value 638 | 639 | 640 | value_percentiles = { 641 | value: key for key, value in normalized_percentile_values.items() 642 | } 643 | 644 | # function for log scaled questions 645 | def generate_cdf_locations(range_min, range_max, zero_point): 646 | if zero_point is None: 647 | scale = lambda x: range_min + (range_max - range_min) * x 648 | else: 649 | deriv_ratio = (range_max - zero_point) / (range_min - zero_point) 650 | scale = lambda x: range_min + (range_max - range_min) * ( 651 | deriv_ratio**x - 1 652 | ) / (deriv_ratio - 1) 653 | return [scale(x) for x in np.linspace(0, 1, cdf_size)] 654 | 655 | cdf_xaxis = generate_cdf_locations(range_min, range_max, zero_point) 656 | 657 | def linear_interpolation(x_values, xy_pairs): 658 | # Sort the xy_pairs by x-values 659 | sorted_pairs = sorted(xy_pairs.items()) 660 | 661 | # Extract sorted x and y values 662 | known_x = [pair[0] for pair in sorted_pairs] 663 | known_y = [pair[1] for pair in sorted_pairs] 664 | 665 | # Initialize the result list 666 | y_values = [] 667 | 668 | for x in x_values: 669 | # Check if x is exactly in the known x values 670 | if x in known_x: 671 | y_values.append(known_y[known_x.index(x)]) 672 | else: 673 | # Find the indices of the two nearest known x-values 674 | i = 0 675 | while i < len(known_x) and known_x[i] < x: 676 | i += 1 677 | 678 | list_index_2 = i 679 | 680 | # If x is outside the range of known x-values, use the nearest endpoint 681 | if i == 0: 682 | y_values.append(known_y[0]) 683 | elif i == len(known_x): 684 | y_values.append(known_y[-1]) 685 | else: 686 | # Perform linear interpolation 687 | x0, x1 = known_x[i - 1], known_x[i] 688 | y0, y1 = known_y[i - 1], known_y[i] 689 | 690 | # Linear interpolation formula 691 | y = y0 + (x - x0) * (y1 - y0) / (x1 - x0) 692 | y_values.append(y) 693 | 694 | return y_values 695 | 696 | continuous_cdf = linear_interpolation(cdf_xaxis, value_percentiles) 697 | return continuous_cdf 698 | 699 | 700 | async def get_numeric_gpt_prediction( 701 | question_details: dict, num_runs: int 702 | ) -> 
tuple[list[float], str]: 703 | 704 | today = datetime.datetime.now().strftime("%Y-%m-%d") 705 | title = question_details["title"] 706 | resolution_criteria = question_details["resolution_criteria"] 707 | background = question_details["description"] 708 | fine_print = question_details["fine_print"] 709 | question_type = question_details["type"] 710 | scaling = question_details["scaling"] 711 | open_upper_bound = question_details["open_upper_bound"] 712 | open_lower_bound = question_details["open_lower_bound"] 713 | unit_of_measure = question_details["unit"] if question_details["unit"] else "Not stated (please infer this)" 714 | upper_bound = scaling["range_max"] 715 | lower_bound = scaling["range_min"] 716 | zero_point = scaling["zero_point"] 717 | if question_type == "discrete": 718 | outcome_count = question_details["scaling"]["inbound_outcome_count"] 719 | cdf_size = outcome_count + 1 720 | else: 721 | cdf_size = 201 722 | 723 | # Create messages about the bounds that are passed in the LLM prompt 724 | if open_upper_bound: 725 | upper_bound_message = "" 726 | else: 727 | upper_bound_message = f"The outcome can not be higher than {upper_bound}." 728 | if open_lower_bound: 729 | lower_bound_message = "" 730 | else: 731 | lower_bound_message = f"The outcome can not be lower than {lower_bound}." 732 | 733 | summary_report = run_research(title) 734 | 735 | content = NUMERIC_PROMPT_TEMPLATE.format( 736 | title=title, 737 | today=today, 738 | background=background, 739 | resolution_criteria=resolution_criteria, 740 | fine_print=fine_print, 741 | summary_report=summary_report, 742 | lower_bound_message=lower_bound_message, 743 | upper_bound_message=upper_bound_message, 744 | units=unit_of_measure, 745 | ) 746 | 747 | async def ask_llm_to_get_cdf(content: str) -> tuple[list[float], str]: 748 | rationale = await call_llm(content) 749 | percentile_values = extract_percentiles_from_response(rationale) 750 | 751 | comment = ( 752 | f"Extracted Percentile_values: {percentile_values}\n\nGPT's Answer: " 753 | f"{rationale}\n\n\n" 754 | ) 755 | 756 | cdf = generate_continuous_cdf( 757 | percentile_values, 758 | question_type, 759 | open_upper_bound, 760 | open_lower_bound, 761 | upper_bound, 762 | lower_bound, 763 | zero_point, 764 | cdf_size, 765 | ) 766 | 767 | return cdf, comment 768 | 769 | cdf_and_comment_pairs = await asyncio.gather( 770 | *[ask_llm_to_get_cdf(content) for _ in range(num_runs)] 771 | ) 772 | comments = [pair[1] for pair in cdf_and_comment_pairs] 773 | final_comment_sections = [ 774 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments) 775 | ] 776 | cdfs: list[list[float]] = [pair[0] for pair in cdf_and_comment_pairs] 777 | all_cdfs = np.array(cdfs) 778 | median_cdf: list[float] = np.median(all_cdfs, axis=0).tolist() 779 | 780 | final_comment = f"Median CDF: `{str(median_cdf)[:100]}...`\n\n" + "\n\n".join( 781 | final_comment_sections 782 | ) 783 | return median_cdf, final_comment 784 | 785 | 786 | ########################## MULTIPLE CHOICE ############### 787 | # @title Multiple Choice prompt & functions 788 | 789 | MULTIPLE_CHOICE_PROMPT_TEMPLATE = """ 790 | You are a professional forecaster interviewing for a job. 791 | 792 | Your interview question is: 793 | {title} 794 | 795 | The options are: {options} 796 | 797 | 798 | Background: 799 | {background} 800 | 801 | {resolution_criteria} 802 | 803 | {fine_print} 804 | 805 | 806 | Your research assistant says: 807 | {summary_report} 808 | 809 | Today is {today}. 
810 | 
811 | Before answering you write:
812 | (a) The time left until the outcome to the question is known.
813 | (b) The status quo outcome if nothing changed.
814 | (c) A description of a scenario that results in an unexpected outcome.
815 | 
816 | You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes.
817 | 
818 | The last thing you write is your final probabilities for the N options in this order {options} as:
819 | Option_A: Probability_A
820 | Option_B: Probability_B
821 | ...
822 | Option_N: Probability_N
823 | """
824 | 
825 | 
826 | def extract_option_probabilities_from_response(forecast_text: str, options) -> list[float]:
827 | 
828 |     # Helper function that returns the last number found on each line of the text
829 |     def extract_option_probabilities(text):
830 | 
831 |         # Number extraction pattern
832 |         number_pattern = r"-?\d+(?:,\d{3})*(?:\.\d+)?"
833 | 
834 |         results = []
835 | 
836 |         # Iterate through each line in the text
837 |         for line in text.split("\n"):
838 |             # Extract all numbers from the line
839 |             numbers = re.findall(number_pattern, line)
840 |             numbers_no_commas = [num.replace(",", "") for num in numbers]
841 |             # Convert strings to float or int
842 |             numbers = [
843 |                 float(num) if "." in num else int(num) for num in numbers_no_commas
844 |             ]
845 |             # Add the last number on the line to results
846 |             if len(numbers) >= 1:
847 |                 last_number = numbers[-1]
848 |                 results.append(last_number)
849 | 
850 |         return results
851 | 
852 |     option_probabilities = extract_option_probabilities(forecast_text)
853 | 
854 |     NUM_OPTIONS = len(options)
855 | 
856 |     if len(option_probabilities) > 0:
857 |         # return the last NUM_OPTIONS items
858 |         return option_probabilities[-NUM_OPTIONS:]
859 |     else:
860 |         raise ValueError(f"Could not extract prediction from response: {forecast_text}")
861 | 
862 | 
863 | def generate_multiple_choice_forecast(options, option_probabilities) -> dict:
864 |     """
865 |     Returns: dict corresponding to the probabilities of each option.
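
    Example (illustrative): generate_multiple_choice_forecast(["A", "B"], [60, 40])
    returns {"A": 0.6, "B": 0.4}.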
866 | """ 867 | 868 | # confirm that there is a probability for each option 869 | if len(options) != len(option_probabilities): 870 | raise ValueError( 871 | f"Number of options ({len(options)}) does not match number of probabilities ({len(option_probabilities)})" 872 | ) 873 | 874 | # Ensure we are using decimals 875 | total_sum = sum(option_probabilities) 876 | decimal_list = [x / total_sum for x in option_probabilities] 877 | 878 | def normalize_list(float_list): 879 | # Step 1: Clamp values 880 | clamped_list = [max(min(x, 0.99), 0.01) for x in float_list] 881 | 882 | # Step 2: Calculate the sum of all elements 883 | total_sum = sum(clamped_list) 884 | 885 | # Step 3: Normalize the list so that all elements add up to 1 886 | normalized_list = [x / total_sum for x in clamped_list] 887 | 888 | # Step 4: Adjust for any small floating-point errors 889 | adjustment = 1.0 - sum(normalized_list) 890 | normalized_list[-1] += adjustment 891 | 892 | return normalized_list 893 | 894 | normalized_option_probabilities = normalize_list(decimal_list) 895 | 896 | probability_yes_per_category = {} 897 | for i in range(len(options)): 898 | probability_yes_per_category[options[i]] = normalized_option_probabilities[i] 899 | 900 | return probability_yes_per_category 901 | 902 | 903 | async def get_multiple_choice_gpt_prediction( 904 | question_details: dict, 905 | num_runs: int, 906 | ) -> tuple[dict[str, float], str]: 907 | 908 | today = datetime.datetime.now().strftime("%Y-%m-%d") 909 | title = question_details["title"] 910 | resolution_criteria = question_details["resolution_criteria"] 911 | background = question_details["description"] 912 | fine_print = question_details["fine_print"] 913 | question_type = question_details["type"] 914 | options = question_details["options"] 915 | 916 | summary_report = run_research(title) 917 | 918 | content = MULTIPLE_CHOICE_PROMPT_TEMPLATE.format( 919 | title=title, 920 | today=today, 921 | background=background, 922 | resolution_criteria=resolution_criteria, 923 | fine_print=fine_print, 924 | summary_report=summary_report, 925 | options=options, 926 | ) 927 | 928 | async def ask_llm_for_multiple_choice_probabilities( 929 | content: str, 930 | ) -> tuple[dict[str, float], str]: 931 | rationale = await call_llm(content) 932 | 933 | 934 | option_probabilities = extract_option_probabilities_from_response( 935 | rationale, options 936 | ) 937 | 938 | comment = ( 939 | f"EXTRACTED_PROBABILITIES: {option_probabilities}\n\nGPT's Answer: " 940 | f"{rationale}\n\n\n" 941 | ) 942 | 943 | probability_yes_per_category = generate_multiple_choice_forecast( 944 | options, option_probabilities 945 | ) 946 | return probability_yes_per_category, comment 947 | 948 | probability_yes_per_category_and_comment_pairs = await asyncio.gather( 949 | *[ask_llm_for_multiple_choice_probabilities(content) for _ in range(num_runs)] 950 | ) 951 | comments = [pair[1] for pair in probability_yes_per_category_and_comment_pairs] 952 | final_comment_sections = [ 953 | f"## Rationale {i+1}\n{comment}" for i, comment in enumerate(comments) 954 | ] 955 | probability_yes_per_category_dicts: list[dict[str, float]] = [ 956 | pair[0] for pair in probability_yes_per_category_and_comment_pairs 957 | ] 958 | average_probability_yes_per_category: dict[str, float] = {} 959 | for option in options: 960 | probabilities_for_current_option: list[float] = [ 961 | dict[option] for dict in probability_yes_per_category_dicts 962 | ] 963 | average_probability_yes_per_category[option] = sum( 964 | 
probabilities_for_current_option 965 | ) / len(probabilities_for_current_option) 966 | 967 | final_comment = ( 968 | f"Average Probability Yes Per Category: `{average_probability_yes_per_category}`\n\n" 969 | + "\n\n".join(final_comment_sections) 970 | ) 971 | return average_probability_yes_per_category, final_comment 972 | 973 | 974 | ################### FORECASTING ################### 975 | def forecast_is_already_made(post_details: dict) -> bool: 976 | """ 977 | Check if a forecast has already been made by looking at my_forecasts in the question data. 978 | 979 | question.my_forecasts.latest.forecast_values has the following values for each question type: 980 | Binary: [probability for no, probability for yes] 981 | Numeric: [cdf value 1, cdf value 2, ..., cdf value 201] 982 | Multiple Choice: [probability for option 1, probability for option 2, ...] 983 | """ 984 | try: 985 | forecast_values = post_details["question"]["my_forecasts"]["latest"][ 986 | "forecast_values" 987 | ] 988 | return forecast_values is not None 989 | except Exception: 990 | return False 991 | 992 | 993 | async def forecast_individual_question( 994 | question_id: int, 995 | post_id: int, 996 | submit_prediction: bool, 997 | num_runs_per_question: int, 998 | skip_previously_forecasted_questions: bool, 999 | ) -> str: 1000 | post_details = get_post_details(post_id) 1001 | question_details = post_details["question"] 1002 | title = question_details["title"] 1003 | question_type = question_details["type"] 1004 | 1005 | summary_of_forecast = "" 1006 | summary_of_forecast += f"-----------------------------------------------\nQuestion: {title}\n" 1007 | summary_of_forecast += f"URL: https://www.metaculus.com/questions/{post_id}/\n" 1008 | 1009 | if question_type == "multiple_choice": 1010 | options = question_details["options"] 1011 | summary_of_forecast += f"options: {options}\n" 1012 | 1013 | if ( 1014 | forecast_is_already_made(post_details) 1015 | and skip_previously_forecasted_questions == True 1016 | ): 1017 | summary_of_forecast += f"Skipped: Forecast already made\n" 1018 | return summary_of_forecast 1019 | 1020 | if question_type == "binary": 1021 | forecast, comment = await get_binary_gpt_prediction( 1022 | question_details, num_runs_per_question 1023 | ) 1024 | elif question_type == "numeric": 1025 | forecast, comment = await get_numeric_gpt_prediction( 1026 | question_details, num_runs_per_question 1027 | ) 1028 | elif question_type == "discrete": 1029 | forecast, comment = await get_numeric_gpt_prediction( 1030 | question_details, num_runs_per_question 1031 | ) 1032 | elif question_type == "multiple_choice": 1033 | forecast, comment = await get_multiple_choice_gpt_prediction( 1034 | question_details, num_runs_per_question 1035 | ) 1036 | else: 1037 | raise ValueError(f"Unknown question type: {question_type}") 1038 | 1039 | print(f"-----------------------------------------------\nPost {post_id} Question {question_id}:\n") 1040 | print(f"Forecast for post {post_id} (question {question_id}):\n{forecast}") 1041 | print(f"Comment for post {post_id} (question {question_id}):\n{comment}") 1042 | 1043 | if question_type == "numeric" or question_type == "discrete": 1044 | summary_of_forecast += f"Forecast: {str(forecast)[:200]}...\n" 1045 | else: 1046 | summary_of_forecast += f"Forecast: {forecast}\n" 1047 | 1048 | summary_of_forecast += f"Comment:\n```\n{comment[:200]}...\n```\n\n" 1049 | 1050 | if submit_prediction == True: 1051 | forecast_payload = create_forecast_payload(forecast, question_type) 1052 | 
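        # For a binary question, the payload built above looks like (illustrative probability):
        # {"probability_yes": 0.07, "probability_yes_per_category": None, "continuous_cdf": None}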
post_question_prediction(question_id, forecast_payload) 1053 | post_question_comment(post_id, comment) 1054 | summary_of_forecast += "Posted: Forecast was posted to Metaculus.\n" 1055 | 1056 | return summary_of_forecast 1057 | 1058 | 1059 | async def forecast_questions( 1060 | open_question_id_post_id: list[tuple[int, int]], 1061 | submit_prediction: bool, 1062 | num_runs_per_question: int, 1063 | skip_previously_forecasted_questions: bool, 1064 | ) -> None: 1065 | forecast_tasks = [ 1066 | forecast_individual_question( 1067 | question_id, 1068 | post_id, 1069 | submit_prediction, 1070 | num_runs_per_question, 1071 | skip_previously_forecasted_questions, 1072 | ) 1073 | for question_id, post_id in open_question_id_post_id 1074 | ] 1075 | forecast_summaries = await asyncio.gather(*forecast_tasks, return_exceptions=True) 1076 | print("\n", "#" * 100, "\nForecast Summaries\n", "#" * 100) 1077 | 1078 | errors = [] 1079 | for question_id_post_id, forecast_summary in zip( 1080 | open_question_id_post_id, forecast_summaries 1081 | ): 1082 | question_id, post_id = question_id_post_id 1083 | if isinstance(forecast_summary, Exception): 1084 | print( 1085 | f"-----------------------------------------------\nPost {post_id} Question {question_id}:\nError: {forecast_summary.__class__.__name__} {forecast_summary}\nURL: https://www.metaculus.com/questions/{post_id}/\n" 1086 | ) 1087 | errors.append(forecast_summary) 1088 | else: 1089 | print(forecast_summary) 1090 | 1091 | if errors: 1092 | print("-----------------------------------------------\nErrors:\n") 1093 | error_message = f"Errors were encountered: {errors}" 1094 | print(error_message) 1095 | raise RuntimeError(error_message) 1096 | 1097 | 1098 | 1099 | 1100 | ######################## FINAL RUN ######################### 1101 | if __name__ == "__main__": 1102 | if USE_EXAMPLE_QUESTIONS: 1103 | open_question_id_post_id = EXAMPLE_QUESTIONS 1104 | else: 1105 | open_question_id_post_id = get_open_question_ids_from_tournament() 1106 | 1107 | asyncio.run( 1108 | forecast_questions( 1109 | open_question_id_post_id, 1110 | SUBMIT_PREDICTION, 1111 | NUM_RUNS_PER_QUESTION, 1112 | SKIP_PREVIOUSLY_FORECASTED_QUESTIONS, 1113 | ) 1114 | ) 1115 | --------------------------------------------------------------------------------