├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── README.md ├── make.bat └── source │ ├── conf.py │ └── index.md ├── emergingtrajectories ├── __init__.py ├── agents.py ├── chunkers.py ├── citationagents.py ├── crawlers.py ├── facts.py ├── factsforecaster.py ├── factsrag.py ├── factsrag2.py ├── factsrag3.py ├── factsragforecaster.py ├── factsragforecaster2.py ├── factsragforecaster3.py ├── knowledge.py ├── news.py ├── pdf.py ├── prompts.py ├── recursiveagent.py └── utils.py ├── forecast1.py ├── forecasttest1.py ├── newstest1.py ├── newstest2.py ├── newstest3.py ├── newstest4.py ├── newstest5.py ├── newstest6.py ├── newstest7.py ├── project_metadata.py ├── requirements-dev.txt ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | python: 12 | install: 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - docs -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Phase AI Technologies Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Emerging Trajectories 2 | 3 | Open source library for tracking and saving forecasts of political, economic, and social events. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install emergingtrajectories 9 | ``` 10 | 11 | ## Questions? 
12 | 13 | Visit our site: https://emergingtrajectories.com/ 14 | 15 | Please reach out: hello --at-- emergingtrajectories --dot-- com 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ### Docs Setup 2 | 3 | 1) Install docs dependencies 4 | ``` 5 | pip install -e .[docs] 6 | ``` 7 | 8 | 2) Run a local docs server 9 | ``` 10 | sphinx-autobuild docs/source/ docs/build/html 11 | ``` 12 | 13 | ### Manual Build 14 | 15 | ``` 16 | cd docs 17 | make html 18 | ``` 19 | 20 | ### Helpful Tools 21 | 22 | * Convert reStructuredText (.rst) to Markdown (.md) 23 | ``` 24 | pip install rst-to-myst[sphinx] 25 | rst2myst convert docs/**/*.rst 26 | ``` 27 | 28 | ### Useful Resources 29 | 30 | * Document Your Scientific Project With Markdown, Sphinx, and Read the Docs | PyData Global 2021 31 | * https://www.sphinx-doc.org/en/master/usage/quickstart.html 32 | * https://www.youtube.com/watch?v=qRSb299awB0 -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../../") 4 | 5 | from project_metadata import NAME, VERSION, AUTHOR # noqa: E402 6 | 7 | # Configuration file for the Sphinx documentation builder. 
8 | # 9 | # For the full list of built-in configuration values, see the documentation: 10 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = NAME 16 | copyright = f"2024, {AUTHOR}" 17 | author = AUTHOR 18 | release = VERSION 19 | 20 | # -- General configuration --------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 22 | 23 | # Add paths to the Python source code. 24 | sys.path.append("../../emergingtrajectories") 25 | 26 | # Allow markdown files to be used. 27 | extensions = [ 28 | "myst_parser", 29 | "autoapi.extension", 30 | "sphinx.ext.duration", 31 | "sphinx.ext.autodoc", 32 | "sphinx.ext.napoleon", 33 | ] 34 | 35 | # Configure autoapi. 36 | autoapi_dirs = ["../../emergingtrajectories"] 37 | autoapi_python_class_content = "init" 38 | 39 | templates_path = ["_templates"] 40 | exclude_patterns = [] 41 | 42 | # -- Options for HTML output ------------------------------------------------- 43 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 44 | 45 | html_theme = "furo" 46 | html_static_path = ["_static"] 47 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | % Phasellm documentation master file, created by 2 | % sphinx-quickstart on Tue Aug 8 15:42:56 2023. 3 | % You can adapt this file completely to your liking, but it should at least 4 | % contain the root `toctree` directive. 5 | 6 | ```{include} ../../README.md 7 | :relative-images: 8 | ``` 9 | 10 | ## Contents 11 | ```{toctree} 12 | :maxdepth: 2 13 | 14 | ``` 15 | -------------------------------------------------------------------------------- /emergingtrajectories/agents.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents for generating forecasts. 3 | """ 4 | 5 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 6 | from phasellm.agents import WebpageAgent, WebSearchAgent 7 | 8 | from . import Client 9 | from .utils import UtilityHelper 10 | from .knowledge import KnowledgeBaseFileCache 11 | 12 | # from . import scrapeandpredict as sap 13 | 14 | import datetime 15 | 16 | # Step 0: provide context 17 | # Step 1: provide content and extract facts 18 | # Step 2: review past forecast and determine if new information changes the forecast 19 | # Step 3: update the actual forecast statement 20 | 21 | base_system_prompt_ext = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 22 | 23 | The project description is as follows... 24 | {statement_description} 25 | 26 | We need your help analyzing content and extracting any relevant information. We'll have a few requests for you... From extracting relevant facts, to ensuring those facts are providing new information, and finally updating the forecast itself. 27 | 28 | The user will provide the relevant requests. 29 | """ 30 | 31 | ext_message_1 = """Today's date is {the_date}. 32 | 33 | Here is all the content we've managed to collect. 
34 | 
35 | ----------------------
36 | {scraped_content}
37 | ----------------------
38 | 
39 | Could you please extract the relevant facts from the content provided? Please simply respond by providing a list of facts in bullet point form, like so...
40 | 
41 | - Fact 1
42 | - Fact 2
43 | ... and so on.
44 | """
45 | 
46 | ext_message_2 = """Today's date is {the_date}.
47 | 
48 | Assume all the content and facts above are accurate and correct up to today's date. The forecasting challenge we are working on is outlined below:
49 | {statement_fill_in_the_blank}
50 | 
51 | The earlier forecast was as follows...
52 | ----------------------
53 | PREDICTION: {forecast_value}
54 | 
55 | JUSTIFICATION...
56 | 
57 | {forecast_justification}
58 | ----------------------
59 | 
60 | Given the above, please use your logical thinking and reasoning to update the "justification" by including any new facts you provided earlier. Update the actual forecast prediction accordingly.
61 | 
62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
63 | """
64 | 
65 | ext_message_3 = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank below... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
66 | 
67 | {statement_fill_in_the_blank}
68 | """
69 | 
70 | 
71 | def ExtendScrapePredictAgent(
72 |     openai_api_key: str,
73 |     google_api_key: str,
74 |     google_search_id: str,
75 |     google_search_query: str,
76 |     knowledge_base: KnowledgeBaseFileCache,
77 |     forecast_id: int,
78 |     et_api_key: str = None,
79 |     statement_title: str = None,
80 |     statement_description: str = None,
81 |     fill_in_the_blank: str = None,
82 |     chat_prompt_system: str = base_system_prompt_ext,
83 |     ext_message_1: str = ext_message_1,
84 |     ext_message_2: str = ext_message_2,
85 |     ext_message_3: str = ext_message_3,
86 |     prediction_title: str = "Prediction",
87 |     prediction_agent: str = "Generic Agent",
88 | ) -> dict:
89 |     """
90 |     Extends an existing forecast by scraping new web content and including any content from the knowledge base that has not yet been accessed (assuming there is new content).
91 | 92 | Args: 93 | openai_api_key: the OpenAI API key 94 | google_api_key: the Google Search API key 95 | google_search_id: the Google search ID 96 | google_search_query: the Google search query 97 | knowledge_base: the KnowledgeBaseFileCache object 98 | forecast_id: the ID of the forecast to extend 99 | et_api_key: the Emerging Trajectories API key 100 | statement_title: the title of the statement (if not submitting a statement ID) 101 | statement_description: the description of the statement (if not submitting a statement ID) 102 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 103 | ext_message_1: the first message to the LLM 104 | ext_message_2: the second message to the LLM 105 | ext_message_3: the third message to the LLM 106 | prediction_title: the title of the forecast 107 | prediction_agent: the agent making the forecast 108 | 109 | Returns: 110 | dict: the response from the Emerging Trajectories platform 111 | """ 112 | 113 | if et_api_key is not None: 114 | client = Client(et_api_key) 115 | forecast = client.get_forecast(forecast_id) 116 | statement_id = forecast["statement_id"] 117 | statement = client.get_statement(statement_id) 118 | statement_title = statement["title"] 119 | statement_description = statement["description"] 120 | fill_in_the_blank = statement["fill_in_the_blank"] 121 | justification = forecast["justification"] 122 | forecast_value = forecast["value"] 123 | 124 | webagent = WebSearchAgent(api_key=google_api_key) 125 | results = webagent.search_google( 126 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 127 | ) 128 | 129 | scraped_content = "" 130 | 131 | added_new_content = False 132 | 133 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 134 | accessed_resources = [] 135 | 136 | for result in results: 137 | if not knowledge_base.in_cache(result.url): 138 | added_new_content = True 139 | page_content = knowledge_base.get(result.url) 140 | 141 | accessed_resources.append(result.url) 142 | # knowledge_base.log_access(result.url) 143 | 144 | scraped_content += f"{page_content}\n\n----------------------\n\n" 145 | 146 | # We also check the knowledge base for content that was added manually. 
147 | unaccessed_uris = knowledge_base.get_unaccessed_content() 148 | for ua in unaccessed_uris: 149 | added_new_content = True 150 | page_content = knowledge_base.get(ua) 151 | 152 | # knowledge_base.log_access(ua) 153 | accessed_resources.append(ua) 154 | 155 | scraped_content += f"{page_content}\n\n----------------------\n\n" 156 | 157 | if not added_new_content: 158 | print("No new content added to the forecast.") 159 | return None 160 | 161 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 162 | 163 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 164 | chatbot = ChatBot(llm) 165 | 166 | # Steps 0 and 1 167 | 168 | prompt_template = ChatPrompt( 169 | [ 170 | {"role": "system", "content": chat_prompt_system}, 171 | {"role": "user", "content": ext_message_1}, 172 | ] 173 | ) 174 | 175 | chatbot.messages = prompt_template.fill( 176 | statement_title=statement_title, 177 | statement_description=statement_description, 178 | statement_fill_in_the_blank=fill_in_the_blank, 179 | scraped_content=scraped_content, 180 | the_date=the_date, 181 | forecast_value=str(forecast_value), 182 | forecast_justification=justification, 183 | ) 184 | 185 | new_facts = chatbot.resend() 186 | 187 | print("\n\n\n") 188 | print(new_facts) 189 | 190 | # Step 3 191 | 192 | prompt_template_2 = ChatPrompt( 193 | [ 194 | {"role": "system", "content": chat_prompt_system}, 195 | {"role": "user", "content": ext_message_1}, 196 | {"role": "assistant", "content": "{new_facts}"}, 197 | {"role": "user", "content": ext_message_2}, 198 | ] 199 | ) 200 | 201 | chatbot.messages = prompt_template_2.fill( 202 | statement_title=statement_title, 203 | statement_description=statement_description, 204 | statement_fill_in_the_blank=fill_in_the_blank, 205 | scraped_content=scraped_content, 206 | new_facts=new_facts, 207 | the_date=the_date, 208 | forecast_value=str(forecast_value), 209 | forecast_justification=justification, 210 | ) 211 | 212 | assistant_analysis = chatbot.resend() 213 | 214 | print("\n\n\n") 215 | print(assistant_analysis) 216 | 217 | # Step 4 218 | 219 | prompt_template_3 = ChatPrompt( 220 | [ 221 | {"role": "system", "content": chat_prompt_system}, 222 | {"role": "user", "content": ext_message_1}, 223 | {"role": "assistant", "content": "{new_facts}"}, 224 | {"role": "user", "content": ext_message_2}, 225 | {"role": "assistant", "content": "{assistant_analysis}"}, 226 | {"role": "user", "content": ext_message_3}, 227 | ] 228 | ) 229 | 230 | chatbot.messages = prompt_template_3.fill( 231 | statement_title=statement_title, 232 | statement_description=statement_description, 233 | statement_fill_in_the_blank=fill_in_the_blank, 234 | scraped_content=scraped_content, 235 | new_facts=new_facts, 236 | assistant_analysis=assistant_analysis, 237 | the_date=the_date, 238 | forecast_value=str(forecast_value), 239 | forecast_justification=justification, 240 | ) 241 | 242 | filled_in_statement = chatbot.resend() 243 | 244 | print("\n\n\n") 245 | print(filled_in_statement) 246 | 247 | uh = UtilityHelper(openai_api_key) 248 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 249 | 250 | response = client.create_forecast( 251 | statement_id, 252 | prediction_title, 253 | assistant_analysis, 254 | prediction, 255 | prediction_agent, 256 | { 257 | "full_response_from_llm": assistant_analysis, 258 | "raw_forecast": filled_in_statement, 259 | "extracted_value": prediction, 260 | }, 261 | forecast_id, 262 | ) 263 | 264 | for ar in accessed_resources: 265 | 
        knowledge_base.log_access(ar)
266 | 
267 |     return response
268 | 
269 | 
270 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}.
271 | 
272 | The project description is as follows...
273 | {statement_description}
274 | 
275 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points.
276 | 
277 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank}
278 | """
279 | 
280 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect.
281 | 
282 | ----------------------
283 | {scraped_content}
284 | ----------------------
285 | 
286 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discussing your logic and rationale for making a forecast based on the above.
287 | 
288 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
289 | """
290 | 
291 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
292 | 
293 | {statement_fill_in_the_blank}
294 | """
295 | 
296 | 
297 | # In this case, we also get any documents that haven't been accessed by the agent.
298 | # This is why agent <-> kb needs to be a 1:1 relationship.
299 | def ScrapeAndPredictAgent(
300 |     openai_api_key: str,
301 |     google_api_key: str,
302 |     google_search_id: str,
303 |     google_search_query: str,
304 |     knowledge_base: KnowledgeBaseFileCache = None,
305 |     statement_id: int = -1,
306 |     et_api_key: str = None,
307 |     statement_title: str = None,
308 |     statement_description: str = None,
309 |     fill_in_the_blank: str = None,
310 |     chat_prompt_system: str = base_system_prompt,
311 |     chat_prompt_user: str = base_user_prompt,
312 |     chat_prompt_user_followup: str = base_user_prompt_followup,
313 |     prediction_title: str = "Prediction",
314 |     prediction_agent: str = "Generic Agent",
315 | ) -> dict:
316 |     """
317 |     This agent submits a search query to Google to find information related to its forecast. It also uses any information that it has not previously accessed in its KnowledgeBase. It then generates a forecast with all the relevant information.
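
    Example (an illustrative sketch only; the API keys, search engine ID, statement ID, and cache folder below are placeholders, and the KnowledgeBaseFileCache constructor arguments are assumed; see knowledge.py for its actual signature):

        from emergingtrajectories.knowledge import KnowledgeBaseFileCache
        from emergingtrajectories.agents import ScrapeAndPredictAgent

        kb = KnowledgeBaseFileCache("kb_cache")  # assumed: a local folder used as the cache

        response = ScrapeAndPredictAgent(
            openai_api_key="sk-...",
            google_api_key="...",
            google_search_id="...",
            google_search_query="global inflation outlook",
            knowledge_base=kb,
            statement_id=42,  # hypothetical statement ID on the Emerging Trajectories platform
            et_api_key="et-...",
            prediction_agent="Example Agent",
        )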
318 | 319 | Args: 320 | openai_api_key: the OpenAI API key 321 | google_api_key: the Google Search API key 322 | google_search_id: the Google search ID 323 | google_search_query: the Google search query 324 | knowledge_base: the KnowledgeBaseFileCache object 325 | statement_id: the ID of the statement to use 326 | et_api_key: the Emerging Trajectories API key 327 | statement_title: the title of the statement (if not submitting a statement ID) 328 | statement_description: the description of the statement (if not submitting a statement ID) 329 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 330 | chat_prompt_system: the system prompt for the chatbot (optional, for overriding defaults) 331 | chat_prompt_user: the user prompt for the chatbot (optional, for overriding defaults) 332 | chat_prompt_user_followup: the follow-up user prompt for the chatbot (optional, for overriding defaults) 333 | prediction_title: the title of the forecast 334 | prediction_agent: the agent making the forecast 335 | 336 | Returns: 337 | dict: the response from the Emerging Trajectories platform 338 | """ 339 | 340 | if et_api_key is not None: 341 | client = Client(et_api_key) 342 | statement = client.get_statement(statement_id) 343 | statement_title = statement["title"] 344 | statement_description = statement["description"] 345 | fill_in_the_blank = statement["fill_in_the_blank"] 346 | 347 | if statement_id == -1 and ( 348 | statement_title is None 349 | or statement_description is None 350 | or fill_in_the_blank is None 351 | ): 352 | raise Exception( 353 | "You must provide either a statement ID or a statement title, description, and fill-in-the-blank." 354 | ) 355 | 356 | webagent = WebSearchAgent(api_key=google_api_key) 357 | results = webagent.search_google( 358 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 359 | ) 360 | 361 | scraped_content = "" 362 | 363 | added_new_content = False 364 | 365 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 366 | accessed_resources = [] 367 | 368 | for result in results: 369 | if not knowledge_base.in_cache(result.url): 370 | added_new_content = True 371 | page_content = knowledge_base.get(result.url) 372 | 373 | accessed_resources.append(result.url) 374 | # knowledge_base.log_access(result.url) 375 | 376 | scraped_content += f"{page_content}\n\n----------------------\n\n" 377 | 378 | # We also check the knowledge base for content that was added manually. 
379 | unaccessed_uris = knowledge_base.get_unaccessed_content() 380 | for ua in unaccessed_uris: 381 | added_new_content = True 382 | page_content = knowledge_base.get(ua) 383 | 384 | accessed_resources.append(ua) 385 | # knowledge_base.log_access(ua) 386 | 387 | scraped_content += f"{page_content}\n\n----------------------\n\n" 388 | 389 | if not added_new_content: 390 | print("No new content added to the forecast.") 391 | return None 392 | 393 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 394 | 395 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 396 | chatbot = ChatBot(llm) 397 | 398 | prompt_template = ChatPrompt( 399 | [ 400 | {"role": "system", "content": chat_prompt_system}, 401 | {"role": "user", "content": chat_prompt_user}, 402 | ] 403 | ) 404 | 405 | chatbot.messages = prompt_template.fill( 406 | statement_title=statement_title, 407 | statement_description=statement_description, 408 | statement_fill_in_the_blank=fill_in_the_blank, 409 | scraped_content=scraped_content, 410 | the_date=the_date, 411 | ) 412 | 413 | assistant_analysis = chatbot.resend() 414 | 415 | print("\n\n\n") 416 | print(assistant_analysis) 417 | 418 | prompt_template_2 = ChatPrompt( 419 | [ 420 | {"role": "system", "content": chat_prompt_system}, 421 | {"role": "user", "content": chat_prompt_user}, 422 | {"role": "assistant", "content": "{assistant_analysis}"}, 423 | {"role": "user", "content": chat_prompt_user_followup}, 424 | ] 425 | ) 426 | 427 | chatbot.messages = prompt_template_2.fill( 428 | statement_title=statement_title, 429 | statement_description=statement_description, 430 | statement_fill_in_the_blank=fill_in_the_blank, 431 | scraped_content=scraped_content, 432 | assistant_analysis=assistant_analysis, 433 | the_date=the_date, 434 | ) 435 | 436 | filled_in_statement = chatbot.resend() 437 | 438 | print("\n\n\n") 439 | print(filled_in_statement) 440 | 441 | uh = UtilityHelper(openai_api_key) 442 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 443 | 444 | response = client.create_forecast( 445 | statement_id, 446 | prediction_title, 447 | assistant_analysis, 448 | prediction, 449 | prediction_agent, 450 | { 451 | "full_response_from_llm": assistant_analysis, 452 | "raw_forecast": filled_in_statement, 453 | "extracted_value": prediction, 454 | }, 455 | ) 456 | 457 | for ar in accessed_resources: 458 | knowledge_base.log_access(ar) 459 | 460 | return response 461 | -------------------------------------------------------------------------------- /emergingtrajectories/chunkers.py: -------------------------------------------------------------------------------- 1 | """ 2 | chunkers.py is used to chunk facts using different strategies. Emerging Trajectories started by chunking via GPT-4, but we can also appreciate using sentences, paragraphs, or other verbatim approaches. We'll be adding more chunkers as time goes on. 3 | 4 | Chunkers should simply take a piece of content and chunk it into a list of facts. 5 | """ 6 | 7 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 8 | from phasellm.agents import WebpageAgent, WebSearchAgent 9 | 10 | fact_system_prompt = """You are a researcher helping extract facts about {topic}, trends, and related observations. We will give you a piece of content scraped on the web. Please extract facts from this. Each fact should stand on its own, and can be several sentences long if need be. You can have as many facts as needed. 
For each fact, please start it as a new line with "---" as the bullet point. For example: 11 | 12 | --- Fact 1... This is the fact. 13 | --- Here is a second fact. 14 | --- And a third fact. 15 | 16 | Please do not include new lines between bullet points. Make sure you write your facts in ENGLISH. Translate any foreign language content/facts/observations into ENGLISH. 17 | 18 | We will simply provide you with content and you will just provide facts.""" 19 | 20 | 21 | class ChunkerGPT4: 22 | 23 | def __init__(self, openai_api_key: str, model="gpt-4-turbo"): 24 | """ 25 | Chunker based on GPT-4 reading text and providing a list of facts. 26 | 27 | Args: 28 | openai_api_key (str): The OpenAI API key. 29 | model (str): The OpenAI model to use. Defaults to "gpt-4-turbo". 30 | """ 31 | self.openai_api_key = openai_api_key 32 | self.model = model 33 | 34 | def chunk(self, content: str, topic: str) -> list[str]: 35 | """ 36 | Chunk text into facts. 37 | 38 | Args: 39 | content (str): The content to chunk. 40 | topic (str): The topic to focus on when building facts. 41 | 42 | Returns: 43 | list[str]: The list of facts. 44 | """ 45 | 46 | llm = OpenAIGPTWrapper(self.openai_api_key, model=self.model) 47 | chatbot = ChatBot(llm) 48 | chatbot.messages = [{"role": "system", "content": fact_system_prompt}] 49 | 50 | prompt_template = ChatPrompt( 51 | [ 52 | {"role": "system", "content": fact_system_prompt}, 53 | ] 54 | ) 55 | 56 | chatbot.messages = prompt_template.fill(topic=topic) 57 | 58 | response = chatbot.chat(content) 59 | 60 | lines = response.split("\n") 61 | 62 | facts = [] 63 | 64 | for line in lines: 65 | if line[0:4] == "--- ": 66 | fact = line[4:] 67 | facts.append(fact) 68 | 69 | return facts 70 | 71 | 72 | class ChunkerNewLines: 73 | 74 | def __init__(self, min_length: int = 7): 75 | """ 76 | Chunker using line breaks for content. 77 | 78 | Args: 79 | min_length (int): The minimum length (in characters) of a fact. Defaults to 7 characters. 80 | """ 81 | self.min_length = min_length 82 | 83 | def chunk(self, content: str, topic: str = None) -> list[str]: 84 | """ 85 | Chunk text into facts. 86 | 87 | Args: 88 | content (str): The content to chunk. 89 | topic (str): The topic to focus on when building facts. This defaults to None so we can keep the same function calls as other chunkers. 90 | 91 | Returns: 92 | list[str]: The list of facts. 93 | """ 94 | 95 | lines = content.split("\n") 96 | 97 | facts = [] 98 | 99 | for line in lines: 100 | ls = line.strip() 101 | if len(ls) >= self.min_length: 102 | facts.append(ls) 103 | 104 | return facts 105 | -------------------------------------------------------------------------------- /emergingtrajectories/citationagents.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents for generating forecasts. 3 | """ 4 | 5 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 6 | from phasellm.agents import WebpageAgent, WebSearchAgent 7 | 8 | from . import Client 9 | from .utils import UtilityHelper 10 | from .knowledge import KnowledgeBaseFileCache 11 | 12 | # from . import scrapeandpredict as sap 13 | 14 | import datetime 15 | import re 16 | 17 | #### 18 | # EXTENDING FORECASTS 19 | # 20 | 21 | base_system_prompt_ext = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 22 | 23 | The project description is as follows... 
24 | {statement_description}
25 | 
26 | We need your help analyzing content and extracting any relevant information. We'll have a few requests for you... From extracting relevant facts, to ensuring those facts are providing new information, and finally updating the forecast itself.
27 | 
28 | The user will provide the relevant requests.
29 | """
30 | 
31 | ext_message_1 = """Today's date is {the_date}.
32 | 
33 | Here is all the content we've managed to collect.
34 | 
35 | ----------------------
36 | {scraped_content}
37 | ----------------------
38 | 
39 | Could you please extract the relevant facts from the content provided? Please simply respond by providing a list of facts in bullet point form, like so...
40 | 
41 | - Fact 1
42 | - Fact 2
43 | ... and so on.
44 | 
45 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you.
46 | 
47 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
48 | 
49 | """
50 | 
51 | ext_message_2 = """Today's date is {the_date}.
52 | 
53 | Assume all the content and facts above are accurate and correct up to today's date. The forecasting challenge we are working on is outlined below:
54 | {statement_fill_in_the_blank}
55 | 
56 | The earlier forecast was as follows...
57 | ----------------------
58 | PREDICTION: {forecast_value}
59 | 
60 | JUSTIFICATION...
61 | 
62 | {forecast_justification}
63 | ----------------------
64 | 
65 | Given the above, please use your logical thinking and reasoning to update the "justification" by including any new facts you provided earlier. Update the actual forecast prediction accordingly.
66 | 
67 | Make sure to reference the citation/source numbers from the fact list.
68 | 
69 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
70 | """
71 | 
72 | ext_message_3 = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank below... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
73 | 
74 | {statement_fill_in_the_blank}
75 | """
76 | 
77 | 
78 | def CiteExtendScrapePredictAgent(
79 |     openai_api_key: str,
80 |     google_api_key: str,
81 |     google_search_id: str,
82 |     google_search_query: str,
83 |     knowledge_base: KnowledgeBaseFileCache,
84 |     forecast_id: int,
85 |     et_api_key: str = None,
86 |     statement_title: str = None,
87 |     statement_description: str = None,
88 |     fill_in_the_blank: str = None,
89 |     chat_prompt_system: str = base_system_prompt_ext,
90 |     ext_message_1: str = ext_message_1,
91 |     ext_message_2: str = ext_message_2,
92 |     ext_message_3: str = ext_message_3,
93 |     prediction_title: str = "Prediction",
94 |     prediction_agent: str = "Generic Agent",
95 | ) -> dict:
96 |     """
97 |     Extends an existing forecast by scraping new web content and including any content from the knowledge base that has not yet been accessed (assuming there is new content).
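
    Example (an illustrative sketch only; the keys, forecast ID, and cache folder are placeholders, and the KnowledgeBaseFileCache constructor shown is assumed; see knowledge.py for its actual signature):

        from emergingtrajectories.knowledge import KnowledgeBaseFileCache
        from emergingtrajectories.citationagents import CiteExtendScrapePredictAgent

        kb = KnowledgeBaseFileCache("kb_cache")  # assumed: a local folder used as the cache

        response = CiteExtendScrapePredictAgent(
            openai_api_key="sk-...",
            google_api_key="...",
            google_search_id="...",
            google_search_query="crude oil price forecast",
            knowledge_base=kb,
            forecast_id=123,  # hypothetical ID of the existing forecast being extended
            et_api_key="et-...",
        )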
98 | 99 | Args: 100 | openai_api_key: the OpenAI API key 101 | google_api_key: the Google Search API key 102 | google_search_id: the Google search ID 103 | google_search_query: the Google search query 104 | knowledge_base: the KnowledgeBaseFileCache object 105 | forecast_id: the ID of the forecast to extend 106 | et_api_key: the Emerging Trajectories API key 107 | statement_title: the title of the statement (if not submitting a statement ID) 108 | statement_description: the description of the statement (if not submitting a statement ID) 109 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 110 | ext_message_1: the first message to the LLM 111 | ext_message_2: the second message to the LLM 112 | ext_message_3: the third message to the LLM 113 | prediction_title: the title of the forecast 114 | prediction_agent: the agent making the forecast 115 | 116 | Returns: 117 | dict: the response from the Emerging Trajectories platform 118 | """ 119 | 120 | if et_api_key is not None: 121 | client = Client(et_api_key) 122 | forecast = client.get_forecast(forecast_id) 123 | statement_id = forecast["statement_id"] 124 | statement = client.get_statement(statement_id) 125 | statement_title = statement["title"] 126 | statement_description = statement["description"] 127 | fill_in_the_blank = statement["fill_in_the_blank"] 128 | justification = forecast["justification"] 129 | forecast_value = forecast["value"] 130 | 131 | webagent = WebSearchAgent(api_key=google_api_key) 132 | results = webagent.search_google( 133 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 134 | ) 135 | 136 | scraped_content = "" 137 | 138 | added_new_content = False 139 | 140 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 141 | accessed_resources = [] 142 | 143 | ctr = 0 144 | ctr_to_source = {} 145 | 146 | for result in results: 147 | if not knowledge_base.in_cache(result.url): 148 | ctr += 1 149 | added_new_content = True 150 | page_content = knowledge_base.get(result.url) 151 | 152 | accessed_resources.append(result.url) 153 | # knowledge_base.log_access(result.url) 154 | 155 | scraped_content += ( 156 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 157 | ) 158 | ctr_to_source[ctr] = result.url 159 | 160 | # We also check the knowledge base for content that was added manually. 
161 | unaccessed_uris = knowledge_base.get_unaccessed_content() 162 | for ua in unaccessed_uris: 163 | added_new_content = True 164 | ctr += 1 165 | page_content = knowledge_base.get(ua) 166 | 167 | accessed_resources.append(ua) 168 | # knowledge_base.log_access(ua) 169 | 170 | scraped_content += f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 171 | ctr_to_source[ctr] = ua 172 | 173 | if not added_new_content: 174 | print("No new content added to the forecast.") 175 | return None 176 | 177 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 178 | 179 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 180 | chatbot = ChatBot(llm) 181 | 182 | # Steps 0 and 1 183 | 184 | prompt_template = ChatPrompt( 185 | [ 186 | {"role": "system", "content": chat_prompt_system}, 187 | {"role": "user", "content": ext_message_1}, 188 | ] 189 | ) 190 | 191 | chatbot.messages = prompt_template.fill( 192 | statement_title=statement_title, 193 | statement_description=statement_description, 194 | statement_fill_in_the_blank=fill_in_the_blank, 195 | scraped_content=scraped_content, 196 | the_date=the_date, 197 | forecast_value=str(forecast_value), 198 | forecast_justification=justification, 199 | ) 200 | 201 | new_facts = chatbot.resend() 202 | 203 | print("\n\n\n") 204 | print(new_facts) 205 | 206 | # Step 3 207 | 208 | prompt_template_2 = ChatPrompt( 209 | [ 210 | {"role": "system", "content": chat_prompt_system}, 211 | {"role": "user", "content": ext_message_1}, 212 | {"role": "assistant", "content": "{new_facts}"}, 213 | {"role": "user", "content": ext_message_2}, 214 | ] 215 | ) 216 | 217 | chatbot.messages = prompt_template_2.fill( 218 | statement_title=statement_title, 219 | statement_description=statement_description, 220 | statement_fill_in_the_blank=fill_in_the_blank, 221 | scraped_content=scraped_content, 222 | new_facts=new_facts, 223 | the_date=the_date, 224 | forecast_value=str(forecast_value), 225 | forecast_justification=justification, 226 | ) 227 | 228 | assistant_analysis = chatbot.resend() 229 | 230 | print("\n\n\n") 231 | print(assistant_analysis) 232 | 233 | # Step 4 234 | 235 | prompt_template_3 = ChatPrompt( 236 | [ 237 | {"role": "system", "content": chat_prompt_system}, 238 | {"role": "user", "content": ext_message_1}, 239 | {"role": "assistant", "content": "{new_facts}"}, 240 | {"role": "user", "content": ext_message_2}, 241 | {"role": "assistant", "content": "{assistant_analysis}"}, 242 | {"role": "user", "content": ext_message_3}, 243 | ] 244 | ) 245 | 246 | chatbot.messages = prompt_template_3.fill( 247 | statement_title=statement_title, 248 | statement_description=statement_description, 249 | statement_fill_in_the_blank=fill_in_the_blank, 250 | scraped_content=scraped_content, 251 | new_facts=new_facts, 252 | assistant_analysis=assistant_analysis, 253 | the_date=the_date, 254 | forecast_value=str(forecast_value), 255 | forecast_justification=justification, 256 | ) 257 | 258 | filled_in_statement = chatbot.resend() 259 | 260 | print("\n\n\n") 261 | print(filled_in_statement) 262 | 263 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 264 | 265 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 266 | print(assistant_analysis_sourced) 267 | 268 | uh = UtilityHelper(openai_api_key) 269 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 270 | 271 | response = client.create_forecast( 272 | statement_id, 273 | prediction_title, 274 | assistant_analysis_sourced, 275 | prediction, 276 | 
        prediction_agent,
277 |         {
278 |             "full_response_from_llm_before_source_cleanup": assistant_analysis,
279 |             "full_response_from_llm": assistant_analysis_sourced,
280 |             "raw_forecast": filled_in_statement,
281 |             "extracted_value": prediction,
282 |         },
283 |         forecast_id,
284 |     )
285 | 
286 |     for ar in accessed_resources:
287 |         knowledge_base.log_access(ar)
288 | 
289 |     return response
290 | 
291 | 
292 | ####
293 | # INITIAL FORECAST
294 | #
295 | 
296 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}.
297 | 
298 | The project description is as follows...
299 | {statement_description}
300 | 
301 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points.
302 | 
303 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank}
304 | """
305 | 
306 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect.
307 | 
308 | ----------------------
309 | {scraped_content}
310 | ----------------------
311 | 
312 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discussing your logic and rationale for making a forecast based on the above.
313 | 
314 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you.
315 | 
316 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
317 | 
318 | Do this for the final justification of your forecast as well.
319 | 
320 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
321 | """
322 | 
323 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
324 | 
325 | {statement_fill_in_the_blank}
326 | """
327 | 
328 | 
329 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str:
330 |     """
331 |     The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the citations to follow the proper numerical order, and also include the URLs at the very end.
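
    For example (illustrative values only), given the analysis text
    "Prices rose in March [4]. Exports fell [2]." and ctr_to_source
    {4: "https://example.com/a", 2: "https://example.com/b"}, the result is
    "Prices rose in March [1]. Exports fell [2]." followed by a
    "--- SOURCES ---" block listing "1: https://example.com/a" and
    "2: https://example.com/b".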
332 | 333 | Args: 334 | assistant_analysis: the analysis text from the assistant 335 | ctr_to_source: the mapping of citation number to source URL 336 | 337 | Returns: 338 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 339 | """ 340 | 341 | new_ctr_map = {} 342 | ctr = 1 343 | 344 | end_notes = "\n\n--- SOURCES ---\n\n" 345 | new_analysis = "" 346 | 347 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 348 | 349 | last_index = 0 350 | for m in matches: 351 | 352 | # print(m.group()) 353 | # print(m.start()) 354 | # print(m.end()) 355 | # print(assistant_analysis[m.start() - 1: m.end() + 1]) 356 | 357 | m_start = m.start() + 1 358 | m_end = m.end() - 1 359 | 360 | old_ctr = int(m.group()[1:-1]) 361 | uri = ctr_to_source[old_ctr] 362 | 363 | if old_ctr not in new_ctr_map: 364 | new_ctr_map[old_ctr] = ctr 365 | end_notes += f"{ctr}: {uri}\n" 366 | ctr += 1 367 | 368 | new_analysis += assistant_analysis[last_index:m_start] + str( 369 | new_ctr_map[old_ctr] 370 | ) 371 | last_index = m_end 372 | 373 | if last_index != 0: 374 | new_analysis += assistant_analysis[last_index:] + end_notes 375 | 376 | else: 377 | new_analysis = assistant_analysis + end_notes + "No citations provided." 378 | 379 | return new_analysis 380 | 381 | 382 | # In this case, we also get any documents that haven't been accessed by the agent. 383 | # This is why agent <-> kb needs to be a 1:1 relationship. 384 | def CitationScrapeAndPredictAgent( 385 | openai_api_key: str, 386 | google_api_key: str, 387 | google_search_id: str, 388 | google_search_query: str, 389 | knowledge_base: KnowledgeBaseFileCache = None, 390 | statement_id: int = -1, 391 | et_api_key: str = None, 392 | statement_title: str = None, 393 | statement_description: str = None, 394 | fill_in_the_blank: str = None, 395 | chat_prompt_system: str = base_system_prompt, 396 | chat_prompt_user: str = base_user_prompt, 397 | chat_prompt_user_followup: str = base_user_prompt_followup, 398 | prediction_title: str = "Prediction", 399 | prediction_agent: str = "Generic Agent", 400 | ) -> dict: 401 | """ 402 | This agent submits a search query to Google to find information related to its forecast. It also uses any information that it has not previously accessed in its KnowledgeBase. It then generates a forecast with all the relevant information. 
403 | 404 | Args: 405 | openai_api_key: the OpenAI API key 406 | google_api_key: the Google Search API key 407 | google_search_id: the Google search ID 408 | google_search_query: the Google search query 409 | knowledge_base: the KnowledgeBaseFileCache object 410 | statement_id: the ID of the statement to use 411 | et_api_key: the Emerging Trajectories API key 412 | statement_title: the title of the statement (if not submitting a statement ID) 413 | statement_description: the description of the statement (if not submitting a statement ID) 414 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 415 | chat_prompt_system: the system prompt for the chatbot (optional, for overriding defaults) 416 | chat_prompt_user: the user prompt for the chatbot (optional, for overriding defaults) 417 | chat_prompt_user_followup: the follow-up user prompt for the chatbot (optional, for overriding defaults) 418 | prediction_title: the title of the forecast 419 | prediction_agent: the agent making the forecast 420 | 421 | Returns: 422 | dict: the response from the Emerging Trajectories platform 423 | """ 424 | 425 | if et_api_key is not None: 426 | client = Client(et_api_key) 427 | statement = client.get_statement(statement_id) 428 | statement_title = statement["title"] 429 | statement_description = statement["description"] 430 | fill_in_the_blank = statement["fill_in_the_blank"] 431 | 432 | if statement_id == -1 and ( 433 | statement_title is None 434 | or statement_description is None 435 | or fill_in_the_blank is None 436 | ): 437 | raise Exception( 438 | "You must provide either a statement ID or a statement title, description, and fill-in-the-blank." 439 | ) 440 | 441 | webagent = WebSearchAgent(api_key=google_api_key) 442 | results = webagent.search_google( 443 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 444 | ) 445 | 446 | scraped_content = "" 447 | 448 | added_new_content = False 449 | 450 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 451 | accessed_resources = [] 452 | 453 | ctr = 0 454 | ctr_to_source = {} 455 | 456 | for result in results: 457 | if not knowledge_base.in_cache(result.url): 458 | ctr += 1 459 | added_new_content = True 460 | page_content = knowledge_base.get(result.url) 461 | 462 | accessed_resources.append(result.url) 463 | # knowledge_base.log_access(result.url) 464 | 465 | scraped_content += ( 466 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 467 | ) 468 | ctr_to_source[ctr] = result.url 469 | 470 | # We also check the knowledge base for content that was added manually. 
471 | unaccessed_uris = knowledge_base.get_unaccessed_content() 472 | for ua in unaccessed_uris: 473 | added_new_content = True 474 | ctr += 1 475 | page_content = knowledge_base.get(ua) 476 | 477 | accessed_resources.append(ua) 478 | # knowledge_base.log_access(ua) 479 | 480 | scraped_content += f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 481 | ctr_to_source[ctr] = ua 482 | 483 | if not added_new_content: 484 | print("No new content added to the forecast.") 485 | return None 486 | 487 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 488 | 489 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 490 | chatbot = ChatBot(llm) 491 | 492 | prompt_template = ChatPrompt( 493 | [ 494 | {"role": "system", "content": chat_prompt_system}, 495 | {"role": "user", "content": chat_prompt_user}, 496 | ] 497 | ) 498 | 499 | chatbot.messages = prompt_template.fill( 500 | statement_title=statement_title, 501 | statement_description=statement_description, 502 | statement_fill_in_the_blank=fill_in_the_blank, 503 | scraped_content=scraped_content, 504 | the_date=the_date, 505 | ) 506 | 507 | assistant_analysis = chatbot.resend() 508 | 509 | print("\n\n\n") 510 | print(assistant_analysis) 511 | 512 | prompt_template_2 = ChatPrompt( 513 | [ 514 | {"role": "system", "content": chat_prompt_system}, 515 | {"role": "user", "content": chat_prompt_user}, 516 | {"role": "assistant", "content": "{assistant_analysis}"}, 517 | {"role": "user", "content": chat_prompt_user_followup}, 518 | ] 519 | ) 520 | 521 | chatbot.messages = prompt_template_2.fill( 522 | statement_title=statement_title, 523 | statement_description=statement_description, 524 | statement_fill_in_the_blank=fill_in_the_blank, 525 | scraped_content=scraped_content, 526 | assistant_analysis=assistant_analysis, 527 | the_date=the_date, 528 | ) 529 | 530 | filled_in_statement = chatbot.resend() 531 | 532 | print("\n\n\n") 533 | print(filled_in_statement) 534 | 535 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 536 | 537 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 538 | print(assistant_analysis_sourced) 539 | 540 | uh = UtilityHelper(openai_api_key) 541 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 542 | 543 | response = client.create_forecast( 544 | statement_id, 545 | prediction_title, 546 | assistant_analysis_sourced, 547 | prediction, 548 | prediction_agent, 549 | { 550 | "full_response_from_llm_before_source_cleanup": assistant_analysis, 551 | "full_response_from_llm": assistant_analysis_sourced, 552 | "raw_forecast": filled_in_statement, 553 | "extracted_value": prediction, 554 | }, 555 | ) 556 | 557 | for ar in accessed_resources: 558 | knowledge_base.log_access(ar) 559 | 560 | return response 561 | -------------------------------------------------------------------------------- /emergingtrajectories/crawlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawlers provide a standardized approach to interacting with with web pages and extracting information. We have a number of crawlers based on PhaseLLM (Python requests) and ones using Playwright (headlessly and with a front-end) to enable flexible scraping. 3 | 4 | All scraping agents return the raw HTML content and the extracted text content. 
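
A minimal usage sketch (the URL is a placeholder, and Playwright's browser binaries must already be installed for crawlerPlaywright to run):

    from emergingtrajectories.crawlers import crawlerPlaywright

    crawler = crawlerPlaywright(headless=True)
    html, text = crawler.get_content("https://example.com")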
5 | """ 6 | 7 | from playwright.sync_api import sync_playwright 8 | from bs4 import BeautifulSoup 9 | 10 | from phasellm.agents import WebpageAgent 11 | 12 | from scrapingbee import ScrapingBeeClient 13 | 14 | 15 | def _bs4_childtraversal(html: str) -> str: 16 | """ 17 | Recursively travserse the DOM to extract content. 18 | 19 | Args: 20 | html (str): HTML content 21 | 22 | Returns: 23 | str: Extracted content 24 | """ 25 | 26 | if len(str(html).strip()) < 2: 27 | return "" 28 | 29 | new_html = "" 30 | 31 | for content in html: 32 | contentname = "" 33 | 34 | if isinstance(content, str): 35 | contentname = "" 36 | elif content.name is not None: 37 | contentname = content.name.lower() 38 | 39 | if contentname in ["p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "span"]: 40 | text = content.get_text() 41 | num_words = len(text.strip().split(" ")) 42 | # print(num_words) 43 | if num_words > 7: 44 | new_html = new_html + content.get_text() + "\n\n" 45 | else: 46 | new_html = new_html + _bs4_childtraversal(content) 47 | 48 | return new_html 49 | 50 | 51 | def _get_text_bs4(html: str) -> str: 52 | """ 53 | Extract text content from HTML using BeautifulSoup. 54 | 55 | Args: 56 | html (str): HTML content 57 | 58 | Returns: 59 | str: Extracted text content 60 | """ 61 | 62 | new_html = "" 63 | 64 | souppre = BeautifulSoup(html, "html.parser") 65 | soup = souppre.body 66 | 67 | for content in soup.contents: 68 | contentname = "" 69 | if content.name is not None: 70 | contentname = content.name.lower() 71 | if contentname not in ["script", "style"]: 72 | new_html = new_html + _bs4_childtraversal(content) 73 | 74 | new_html = new_html + "" 75 | 76 | newsoup = BeautifulSoup(new_html, "html.parser") 77 | text = newsoup.get_text() 78 | 79 | return text 80 | 81 | 82 | class crawlerPlaywright: 83 | 84 | def __init__(self, headless: bool = True) -> None: 85 | """ 86 | Crawler that uses Playwright to scrape web pages. 87 | 88 | Args: 89 | headless (bool, optional): Run the browser in headless mode. Defaults to True. 90 | """ 91 | self.headless = headless 92 | 93 | def get_content(self, url: str) -> tuple[str, str]: 94 | """ 95 | Gets content for a specific URL. 96 | 97 | Args: 98 | url (str): URL to scrape 99 | 100 | Returns: 101 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 102 | """ 103 | 104 | content = "" 105 | text = "" 106 | with sync_playwright() as playwright: 107 | 108 | browser = playwright.chromium.launch(headless=self.headless) 109 | page = browser.new_page() 110 | 111 | # Navigate to the webpage 112 | page.goto(url) 113 | 114 | # Extract data 115 | content = page.content() 116 | 117 | # Close the browser 118 | browser.close() 119 | 120 | text = _get_text_bs4(content) 121 | 122 | return content, text 123 | 124 | 125 | class crawlerPhaseLLM: 126 | 127 | def __init__(self): 128 | """ 129 | PhaseLLM scraper. Uses Python requests and does not execute JS. 130 | """ 131 | self.scraper = WebpageAgent() 132 | 133 | def get_content(self, url): 134 | """ 135 | Gets content for a specific URL. 
136 | 137 | Args: 138 | url (str): URL to scrape 139 | 140 | Returns: 141 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 142 | """ 143 | content_raw = self.scraper.scrape(url, text_only=False, body_only=False) 144 | content_parsed = self.scraper.scrape(url, text_only=True, body_only=True) 145 | return content_raw, content_parsed 146 | 147 | 148 | class crawlerScrapingBee: 149 | 150 | def __init__(self, api_key: str): 151 | """ 152 | Crawler that uses ScrapingBee to scrape web pages. 153 | """ 154 | self.client = ScrapingBeeClient(api_key=api_key) 155 | 156 | def get_content(self, url): 157 | """ 158 | Gets content for a specific URL. 159 | 160 | Args: 161 | url (str): URL to scrape 162 | 163 | Returns: 164 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 165 | """ 166 | 167 | response = self.client.get(url) 168 | content_raw = response.content.decode("utf-8") 169 | content_parsed = _get_text_bs4(content_raw) 170 | return content_raw, content_parsed 171 | -------------------------------------------------------------------------------- /emergingtrajectories/facts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Facts agent. Similar to knowledge agent but simply provides a list of facts and associated sources. 3 | 4 | This abstracts away the fact generation from forecast creation, thus allowing us to test different prompting strategies and LLMs. 5 | """ 6 | 7 | import os 8 | import json 9 | import hashlib 10 | import re 11 | 12 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 13 | from django.core.serializers.json import DjangoJSONEncoder 14 | 15 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 16 | from phasellm.agents import WebpageAgent, WebSearchAgent 17 | 18 | from datetime import datetime 19 | 20 | from . import Client 21 | from .crawlers import crawlerPlaywright 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | # Number of search results to return from web searche (default value). 25 | _DEFAULT_NUM_SEARCH_RESULTS = 10 26 | 27 | facts_base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 28 | 29 | The project description is as follows... 30 | {statement_description} 31 | 32 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking. Rather than using bullet points, please list each as F1, F2, F3, etc... So that we can reference it. 33 | 34 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 35 | 36 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number. 37 | 38 | Thus, a bullet point would look like this: 39 | F1: (information) [1] 40 | F2: (information) [1] 41 | F3: (information) [2] 42 | 43 | ... and so on, where F1, F2, F3, etc. are facts, and [1], [2] are the source documents you are extracting the facts from. 44 | """ 45 | 46 | facts_base_user_prompt = """Today's date is {the_date}. 
We will now provide you with all the content we've managed to collect. 47 | 48 | ---------------------- 49 | {scraped_content} 50 | ---------------------- 51 | 52 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) share any insights you might have based on the facts. 53 | 54 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 55 | 56 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the actual number. 57 | 58 | DO NOT PROVIDE A FORECAST, BUT SIMPLY STATE AND SHARE THE FACTS AND INSIGHTS YOU HAVE GATHERED. 59 | """ 60 | 61 | 62 | def uri_to_local(uri: str) -> str: 63 | """ 64 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 65 | 66 | Args: 67 | uri (str): The URI to convert. 68 | 69 | Returns: 70 | str: The MD5 sum of the URI. 71 | """ 72 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 73 | return uri_md5 74 | 75 | 76 | # TODO Move to Utils.py, or elsewhere. 77 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str: 78 | """ 79 | The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the cituations to follow the proper numerical order, and also include the URLs at the very end. 80 | 81 | Args: 82 | assistant_analysis: the analysis text from the assistant 83 | ctr_to_source: the mapping of citation number to source URL 84 | 85 | Returns: 86 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 87 | """ 88 | 89 | new_ctr_map = {} 90 | ctr = 1 91 | 92 | end_notes = "\n\n--- SOURCES ---\n\n" 93 | new_analysis = "" 94 | 95 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 96 | 97 | last_index = 0 98 | for m in matches: 99 | 100 | m_start = m.start() + 1 101 | m_end = m.end() - 1 102 | 103 | old_ctr = int(m.group()[1:-1]) 104 | uri = ctr_to_source[old_ctr] 105 | 106 | if old_ctr not in new_ctr_map: 107 | new_ctr_map[old_ctr] = ctr 108 | end_notes += f"{ctr}: {uri}\n" 109 | ctr += 1 110 | 111 | new_analysis += assistant_analysis[last_index:m_start] + str( 112 | new_ctr_map[old_ctr] 113 | ) 114 | last_index = m_end 115 | 116 | if last_index != 0: 117 | new_analysis += assistant_analysis[last_index:] + end_notes 118 | 119 | else: 120 | new_analysis = assistant_analysis + end_notes + "No citations provided." 121 | 122 | return new_analysis 123 | 124 | 125 | # TODO If this works, it should be an agent with setllm() supported, etc. 126 | class FactBaseFileCache: 127 | 128 | def __init__( 129 | self, folder_path: str, cache_file: str = "cache.json", crawler=None 130 | ) -> None: 131 | """ 132 | The KnowledgeBaseFileCache is a simple file-based cache for web content and local files. The cache stores the original HTML, PDF, or TXT content and tracks when (if ever) an agent actually accessed the content. 133 | 134 | Args: 135 | folder_path (str): The folder where the cache will be stored. 136 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json". 
137 | """ 138 | self.root_path = folder_path 139 | self.root_parsed = os.path.join(folder_path, "parsed") 140 | self.root_original = os.path.join(folder_path, "original") 141 | self.cache_file = os.path.join(folder_path, cache_file) 142 | self.cache = self.load_cache() 143 | 144 | if crawler is None: 145 | self.crawler = crawlerPlaywright() 146 | else: 147 | self.crawler = crawler 148 | 149 | # TODO: this function is a new one compared to the KnowledgeBaseFileCache 150 | # TODO: refactor this + code where we run one query 151 | def summarize_new_info_multiple_queries( 152 | self, 153 | statement, 154 | chatbot, 155 | google_api_key, 156 | google_search_id, 157 | google_search_queries, 158 | fileout=None, 159 | ) -> str: 160 | 161 | self.google_api_key = google_api_key 162 | self.google_search_id = google_search_id 163 | self.google_search_queries = google_search_queries 164 | 165 | webagent = WebSearchAgent(api_key=self.google_api_key) 166 | 167 | scraped_content = "" 168 | added_new_content = False 169 | 170 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 171 | accessed_resources = [] 172 | 173 | ctr = 0 174 | ctr_to_source = {} 175 | 176 | for google_search_query in self.google_search_queries: 177 | 178 | results = webagent.search_google( 179 | query=google_search_query, 180 | custom_search_engine_id=self.google_search_id, 181 | num=_DEFAULT_NUM_SEARCH_RESULTS, 182 | ) 183 | 184 | added_new_content = False 185 | 186 | for result in results: 187 | if not self.in_cache(result.url): 188 | ctr += 1 189 | added_new_content = True 190 | 191 | try: 192 | page_content = self.get(result.url) 193 | print(page_content) 194 | except Exception as e: 195 | print(f"Failed to get content from {result.url}\n{e}") 196 | self.force_empty(result.url) 197 | page_content = "" 198 | 199 | accessed_resources.append(result.url) 200 | # knowledge_base.log_access(result.url) 201 | 202 | scraped_content += ( 203 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 204 | ) 205 | ctr_to_source[ctr] = result.url 206 | 207 | # We also check the knowledge base for content that was added manually. 
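# Illustrative sketch (added for clarity, not part of the original code): clean_citations(),
# defined near the top of this module, renumbers the [#] citations that the LLM produces
# against the "--- SOURCE: # ---" blocks assembled above. The URLs here are hypothetical.
#
#   analysis = "Prices weakened in March [3]. Inventories also grew [1]. Demand stayed soft [3]."
#   cleaned = clean_citations(analysis, {1: "https://example.com/inventories", 3: "https://example.com/prices"})
#
# Citations are renumbered in the order they first appear -- [3] becomes [1] and [1] becomes [2] --
# and a "--- SOURCES ---" section mapping the new numbers to their URLs is appended to the text.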
208 | unaccessed_uris = self.get_unaccessed_content() 209 | for ua in unaccessed_uris: 210 | added_new_content = True 211 | ctr += 1 212 | page_content = self.get(ua) 213 | 214 | accessed_resources.append(ua) 215 | # knowledge_base.log_access(ua) 216 | 217 | scraped_content += ( 218 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 219 | ) 220 | ctr_to_source[ctr] = ua 221 | 222 | if not added_new_content: 223 | print("No new content added to the forecast.") 224 | return None 225 | 226 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 227 | 228 | prompt_template = ChatPrompt( 229 | [ 230 | {"role": "system", "content": facts_base_system_prompt}, 231 | {"role": "user", "content": facts_base_user_prompt}, 232 | ] 233 | ) 234 | 235 | chatbot.messages = prompt_template.fill( 236 | statement_title=statement.title, 237 | statement_description=statement.description, 238 | statement_fill_in_the_blank=statement.fill_in_the_blank, 239 | scraped_content=scraped_content, 240 | the_date=the_date, 241 | ) 242 | 243 | assistant_analysis = chatbot.resend() 244 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 245 | 246 | print("\n\n\n") 247 | print(assistant_analysis_sourced) 248 | 249 | if fileout is not None: 250 | with open(fileout, "w") as w: 251 | w.write(assistant_analysis_sourced) 252 | 253 | for ar in accessed_resources: 254 | self.log_access(ar) 255 | 256 | return assistant_analysis_sourced 257 | 258 | # TODO: this function is a new one compared to the KnowledgeBaseFileCache 259 | def summarize_new_info( 260 | self, 261 | statement, 262 | chatbot, 263 | google_api_key, 264 | google_search_id, 265 | google_search_query, 266 | fileout=None, 267 | ) -> str: 268 | 269 | self.google_api_key = google_api_key 270 | self.google_search_id = google_search_id 271 | self.google_search_query = google_search_query 272 | 273 | webagent = WebSearchAgent(api_key=self.google_api_key) 274 | results = webagent.search_google( 275 | query=self.google_search_query, 276 | custom_search_engine_id=self.google_search_id, 277 | num=_DEFAULT_NUM_SEARCH_RESULTS, 278 | ) 279 | 280 | scraped_content = "" 281 | 282 | added_new_content = False 283 | 284 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 285 | accessed_resources = [] 286 | 287 | ctr = 0 288 | ctr_to_source = {} 289 | 290 | for result in results: 291 | if not self.in_cache(result.url): 292 | ctr += 1 293 | added_new_content = True 294 | 295 | try: 296 | page_content = self.get(result.url) 297 | print(page_content) 298 | except Exception as e: 299 | print(f"Failed to get content from {result.url}\n{e}") 300 | self.force_empty(result.url) 301 | page_content = "" 302 | 303 | accessed_resources.append(result.url) 304 | # knowledge_base.log_access(result.url) 305 | 306 | scraped_content += ( 307 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 308 | ) 309 | ctr_to_source[ctr] = result.url 310 | 311 | # We also check the knowledge base for content that was added manually. 
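# Minimal usage sketch (illustrative only; the API keys, query, and statement object are
# placeholders -- the statement needs .title, .description, and .fill_in_the_blank attributes).
# summarize_new_info() runs a single Google query, scrapes any uncached results, and returns a
# sourced fact list, or None when no new content was found:
#
#   kb = FactBaseFileCache("et_fact_cache")
#   llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview")
#   summary = kb.summarize_new_info(statement, ChatBot(llm), google_api_key,
#                                   google_search_id, "global oil demand forecast",
#                                   fileout="latest_facts.txt")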
312 | unaccessed_uris = self.get_unaccessed_content() 313 | for ua in unaccessed_uris: 314 | added_new_content = True 315 | ctr += 1 316 | page_content = self.get(ua) 317 | 318 | accessed_resources.append(ua) 319 | # knowledge_base.log_access(ua) 320 | 321 | scraped_content += ( 322 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 323 | ) 324 | ctr_to_source[ctr] = ua 325 | 326 | if not added_new_content: 327 | print("No new content added to the forecast.") 328 | return None 329 | 330 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 331 | 332 | prompt_template = ChatPrompt( 333 | [ 334 | {"role": "system", "content": facts_base_system_prompt}, 335 | {"role": "user", "content": facts_base_user_prompt}, 336 | ] 337 | ) 338 | 339 | chatbot.messages = prompt_template.fill( 340 | statement_title=statement.title, 341 | statement_description=statement.description, 342 | statement_fill_in_the_blank=statement.fill_in_the_blank, 343 | scraped_content=scraped_content, 344 | the_date=the_date, 345 | ) 346 | 347 | assistant_analysis = chatbot.resend() 348 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 349 | 350 | print("\n\n\n") 351 | print(assistant_analysis_sourced) 352 | 353 | if fileout is not None: 354 | with open(fileout, "w") as w: 355 | w.write(assistant_analysis_sourced) 356 | 357 | for ar in accessed_resources: 358 | self.log_access(ar) 359 | 360 | return assistant_analysis_sourced 361 | 362 | def save_state(self) -> None: 363 | """ 364 | Saves the in-memory changes to the knowledge base to the JSON cache file. 365 | """ 366 | with open(self.cache_file, "w") as f: 367 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 368 | 369 | def load_cache(self) -> None: 370 | """ 371 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 372 | """ 373 | 374 | if not os.path.exists(self.root_path): 375 | os.makedirs(self.root_path) 376 | 377 | if not os.path.exists(self.root_parsed): 378 | os.makedirs(self.root_parsed) 379 | 380 | if not os.path.exists(self.root_original): 381 | os.makedirs(self.root_original) 382 | 383 | if not os.path.exists(self.cache_file): 384 | with open(self.cache_file, "w") as f: 385 | f.write("{}") 386 | 387 | with open(self.cache_file, "r") as f: 388 | return json.load(f) 389 | 390 | def in_cache(self, uri: str) -> bool: 391 | """ 392 | Checks if a URI is in the cache already. 393 | 394 | Args: 395 | uri (str): The URI to check. 396 | 397 | Returns: 398 | bool: True if the URI is in the cache, False otherwise. 399 | """ 400 | if uri in self.cache: 401 | return True 402 | return False 403 | 404 | def update_cache( 405 | self, uri: str, obtained_on: datetime, last_accessed: datetime 406 | ) -> None: 407 | """ 408 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 409 | 410 | Args: 411 | uri (str): The URI to update. 412 | obtained_on (datetime): The date and time when the content was obtained. 413 | last_accessed (datetime): The date and time when the content was last accessed. 414 | """ 415 | uri_md5 = uri_to_local(uri) 416 | self.cache[uri] = { 417 | "obtained_on": obtained_on, 418 | "last_accessed": last_accessed, 419 | "accessed": 0, 420 | "uri_md5": uri_md5, 421 | } 422 | self.save_state() 423 | 424 | def log_access(self, uri: str) -> None: 425 | """ 426 | Saves the last accessed time and updates the accessed tracker for a given URI. 427 | 428 | Args: 429 | uri (str): The URI to update. 
430 | """ 431 | self.cache[uri]["last_accessed"] = datetime.now() 432 | self.cache[uri]["accessed"] = 1 433 | self.save_state() 434 | 435 | def get_unaccessed_content(self) -> list[str]: 436 | """ 437 | Returns a list of URIs that have not been accessed by the agent. 438 | 439 | Returns: 440 | list[str]: A list of URIs that have not been accessed by the agent. 441 | """ 442 | unaccessed = [] 443 | for uri in self.cache: 444 | if self.cache[uri]["accessed"] == 0: 445 | unaccessed.append(uri) 446 | return unaccessed 447 | 448 | def force_empty(self, uri: str) -> None: 449 | """ 450 | Saves an empty file for a given URI. Used when the page is erroring out. 451 | 452 | Args: 453 | uri (str): The URI to empty the cache for. 454 | """ 455 | uri_md5 = uri_to_local(uri) 456 | 457 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 458 | f.write("") 459 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 460 | f.write("") 461 | 462 | self.update_cache(uri, datetime.now(), datetime.now()) 463 | 464 | def get(self, uri: str) -> str: 465 | """ 466 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 467 | 468 | Args: 469 | uri (str): The URI to get the content for. 470 | 471 | Returns: 472 | str: The content for the given URI. 473 | """ 474 | uri_md5 = uri_to_local(uri) 475 | if uri in self.cache: 476 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 477 | return f.read() 478 | else: 479 | # scraper = WebpageAgent() 480 | 481 | # content_raw = scraper.scrape(uri, text_only=False, body_only=False) 482 | # with open(os.path.join(self.root_original, uri_md5), "w") as f: 483 | # f.write(content_raw) 484 | 485 | # content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 486 | # with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 487 | # f.write(content_parsed) 488 | 489 | content, text = self.crawler.get_content(uri) 490 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 491 | f.write(content) 492 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 493 | f.write(text) 494 | 495 | self.update_cache(uri, datetime.now(), datetime.now()) 496 | 497 | return text 498 | 499 | def add_content(self, content: str, uri: str = None) -> None: 500 | """ 501 | Adds content to cache. 502 | 503 | Args: 504 | content (str): The content to add to the cache. 505 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 506 | """ 507 | if uri is None: 508 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 509 | uri_md5 = uri_to_local(uri) 510 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 511 | f.write(content) 512 | self.update_cache(uri, datetime.now(), datetime.now()) 513 | 514 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 515 | """ 516 | Adds content from a text file to the cache. 517 | 518 | Args: 519 | filepath (str): The path to the file to add to the cache. 520 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 
521 | """ 522 | with open(filepath, "r") as f: 523 | content = f.read() 524 | self.add_content(content, uri) 525 | -------------------------------------------------------------------------------- /emergingtrajectories/factsforecaster.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .facts import FactBaseFileCache 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you.""" 11 | 12 | start_user_prompt = """Here is the research: 13 | --------------------- 14 | {content} 15 | --------------------- 16 | {additional_facts} 17 | 18 | Given the above, we need you to do your best to fill in the following blank... 19 | {fill_in_the_blank} 20 | 21 | PLEASE DO THE FOLLOWING: 22 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 23 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 24 | - Do not provide a range; provide ONE number. 25 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 26 | 27 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 28 | """ 29 | 30 | extend_user_prompt = """Here is the research: 31 | --------------------- 32 | {content} 33 | --------------------- 34 | {additional_facts} 35 | 36 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 37 | --------------------- 38 | FORECAST: {earlier_forecast_value} 39 | 40 | JUSTIFICATION: 41 | {earlier_forecast} 42 | --------------------- 43 | 44 | Given the above, we need you to do your best to fill in the following blank... 45 | {fill_in_the_blank} 46 | 47 | PLEASE DO THE FOLLOWING: 48 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 49 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 50 | - Do not provide a range; provide ONE number. 51 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 52 | 53 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 54 | 55 | """ 56 | 57 | 58 | class FactForecastingAgent(object): 59 | 60 | # TODO: document / clean up 61 | def __init__( 62 | self, 63 | client: ETClient, 64 | chatbot: ChatBot, 65 | factbase: FactBaseFileCache, 66 | ): 67 | 68 | self.client = client 69 | self.chatbot = chatbot 70 | self.factbase = factbase 71 | 72 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 
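# Illustrative construction sketch (placeholder objects and keys; the exact ETClient setup may
# differ). The agent combines an ETClient, a ChatBot used to write the forecast itself, and a
# FactBaseFileCache that gathers and summarizes new source material:
#
#   llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview")
#   agent = FactForecastingAgent(client=et_client, chatbot=ChatBot(llm),
#                                factbase=FactBaseFileCache("fact_cache"))
#
# create_forecast() and extend_forecast() below accept either a single Google query string or a
# list of query strings; any other type raises a ValueError.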
73 | def setChatBot(self, chatbot): 74 | self.chatbot = chatbot 75 | 76 | # TODO: standardize -- camel case or snake case? Or something else? 77 | def getChatBot(self): 78 | return self.chatbot 79 | 80 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 81 | # TODO: Google query can be a list of queries, not just a single query. 82 | def create_forecast( 83 | self, 84 | statement: Statement, 85 | openai_api_key, 86 | et_api_key, 87 | google_api_key, 88 | google_search_id, 89 | google_search_query, 90 | facts=None, 91 | prediction_agent="Test Agent", 92 | ): 93 | 94 | fact_llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 95 | fact_chatbot = ChatBot(fact_llm) 96 | 97 | if isinstance(google_search_query, str): 98 | print("CALLING SINGLE QUERY...") 99 | content = self.factbase.summarize_new_info( 100 | statement, 101 | fact_chatbot, 102 | google_api_key, 103 | google_search_id, 104 | google_search_query, 105 | ) 106 | elif isinstance(google_search_query, list): 107 | content = self.factbase.summarize_new_info_multiple_queries( 108 | statement, 109 | fact_chatbot, 110 | google_api_key, 111 | google_search_id, 112 | google_search_query, 113 | ) 114 | else: 115 | raise ValueError( 116 | "google_search_query must be a string or a list of strings" 117 | ) 118 | 119 | if content is None: 120 | print("No new content added to the forecast.") 121 | return None 122 | 123 | chatbot_messages = [ 124 | {"role": "system", "content": start_system_prompt}, 125 | {"role": "user", "content": start_user_prompt}, 126 | ] 127 | 128 | chatbot = self.chatbot 129 | 130 | prompt_template = ChatPrompt(chatbot_messages) 131 | 132 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 133 | 134 | additional_facts = "" 135 | if facts is not None: 136 | additional_facts = "Some additional facts for consideration are below...\n" 137 | afctr = 1 138 | for f in facts: 139 | additional_facts += f"AF{afctr}: {f}\n" 140 | afctr += 1 141 | additional_facts += "---------------------\n\n" 142 | 143 | chatbot.messages = prompt_template.fill( 144 | statement_title=statement.title, 145 | statement_description=statement.description, 146 | statement_fill_in_the_blank=statement.fill_in_the_blank, 147 | fill_in_the_blank_2=statement.fill_in_the_blank, 148 | content=content, 149 | the_date=the_date, 150 | additional_facts=additional_facts, 151 | ) 152 | 153 | assistant_analysis = chatbot.resend() 154 | 155 | print("\n\n\n") 156 | print(assistant_analysis) 157 | 158 | uh = UtilityHelper(openai_api_key) 159 | prediction = uh.extract_prediction( 160 | assistant_analysis, statement.fill_in_the_blank 161 | ) 162 | 163 | client = Client(et_api_key) 164 | 165 | full_content = content + "\n\n-----------------\n\n" + assistant_analysis 166 | 167 | response = client.create_forecast( 168 | statement.id, 169 | "Prediction", 170 | full_content, 171 | prediction, 172 | prediction_agent, 173 | { 174 | "full_response_from_llm_before_source_cleanup": content, 175 | "full_response_from_llm": assistant_analysis, 176 | "extracted_value": prediction, 177 | }, 178 | ) 179 | 180 | return response 181 | 182 | def extend_forecast( 183 | self, 184 | forecast: Forecast, 185 | openai_api_key, 186 | et_api_key, 187 | google_api_key, 188 | google_search_id, 189 | google_search_query, 190 | facts=None, 191 | prediction_agent="Test Agent", 192 | ): 193 | 194 | fact_llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 195 | fact_chatbot = ChatBot(fact_llm) 196 | 197 | if 
isinstance(google_search_query, str): 198 | content = self.factbase.summarize_new_info( 199 | forecast.statement, 200 | fact_chatbot, 201 | google_api_key, 202 | google_search_id, 203 | google_search_query, 204 | ) 205 | elif isinstance(google_search_query, list): 206 | content = self.factbase.summarize_new_info_multiple_queries( 207 | forecast.statement, 208 | fact_chatbot, 209 | google_api_key, 210 | google_search_id, 211 | google_search_query, 212 | ) 213 | else: 214 | raise ValueError( 215 | "google_search_query must be a string or a list of strings" 216 | ) 217 | 218 | if content is None: 219 | print("No new content added to the forecast.") 220 | return None 221 | 222 | chatbot_messages = [ 223 | {"role": "system", "content": start_system_prompt}, 224 | {"role": "user", "content": extend_user_prompt}, 225 | ] 226 | 227 | chatbot = self.chatbot 228 | 229 | prompt_template = ChatPrompt(chatbot_messages) 230 | 231 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 232 | 233 | additional_facts = "" 234 | if facts is not None: 235 | additional_facts = "Some additional facts for consideration are below...\n" 236 | afctr = 1 237 | for f in facts: 238 | additional_facts += f"AF{afctr}: {f}\n" 239 | afctr += 1 240 | additional_facts += "---------------------\n\n" 241 | 242 | chatbot.messages = prompt_template.fill( 243 | statement_title=forecast.statement.title, 244 | statement_description=forecast.statement.description, 245 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 246 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 247 | content=content, 248 | the_date=the_date, 249 | additional_facts=additional_facts, 250 | earlier_forecast_value=str(forecast.value), 251 | earlier_forecast=forecast.justification, 252 | ) 253 | 254 | assistant_analysis = chatbot.resend() 255 | 256 | print("\n\n\n") 257 | print(assistant_analysis) 258 | 259 | uh = UtilityHelper(openai_api_key) 260 | prediction = uh.extract_prediction( 261 | assistant_analysis, forecast.statement.fill_in_the_blank 262 | ) 263 | 264 | client = Client(et_api_key) 265 | 266 | full_content = content + "\n\n-----------------\n\n" + assistant_analysis 267 | 268 | response = client.create_forecast( 269 | forecast.statement.id, 270 | "Prediction", 271 | full_content, 272 | prediction, 273 | prediction_agent, 274 | { 275 | "full_response_from_llm_before_source_cleanup": content, 276 | "full_response_from_llm": assistant_analysis, 277 | "extracted_value": prediction, 278 | }, 279 | forecast.id, 280 | ) 281 | 282 | return response 283 | -------------------------------------------------------------------------------- /emergingtrajectories/factsrag.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an experimental approach to tracking information regardless of source type. It will also power more than recent updates. Here's how it works... 3 | 4 | 1. All "Content Sources" (a new class type that obtains content) will send content directly to the Facts DB. 5 | 2. The "Facts DB" will then extract all relevant facts for a prediction or research theme. It will keep cache the original content, will track the sources, and will also input all the facts into a RAG database. 6 | 3. We can then query the DB for relevant facts on an ad hoc basis, rather than only for new content. 7 | 8 | """ 9 | 10 | import os 11 | import json 12 | import hashlib 13 | import re 14 | 15 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 
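# Illustrative end-to-end sketch of the flow described in the module docstring (keys, folder
# names, and queries are placeholders; FactRAGFileCache and FactBot are defined later in this
# module):
#
#   factdb = FactRAGFileCache("fact_db", openai_api_key)
#   factdb.new_get_new_info_google(google_api_key, google_search_id,
#                                  ["oil demand forecast"], topic="global oil demand")
#   bot = FactBot(factdb, openai_api_key)
#   print(bot.ask("What is happening to global oil demand?"))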
16 | from django.core.serializers.json import DjangoJSONEncoder 17 | 18 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 19 | from phasellm.agents import WebpageAgent, WebSearchAgent 20 | 21 | from datetime import datetime, timedelta 22 | 23 | from . import Client 24 | from .crawlers import crawlerPlaywright 25 | from .prompts import * 26 | from .news import NewsAPIAgent, RSSAgent, FinancialTimesAgent 27 | 28 | import chromadb 29 | import chromadb.utils.embedding_functions as embedding_functions 30 | 31 | # Number of search results to return from web searche (default value). 32 | _DEFAULT_NUM_SEARCH_RESULTS = 10 33 | 34 | facts_base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 35 | 36 | The project description is as follows... 37 | {statement_description} 38 | 39 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking. Rather than using bullet points, please list each as F1, F2, F3, etc... So that we can reference it. 40 | 41 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 42 | 43 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number. 44 | 45 | Thus, a bullet point would look like this: 46 | F1: (information) [1] 47 | F2: (information) [1] 48 | F3: (information) [2] 49 | 50 | ... and so on, where F1, F2, F3, etc. are facts, and [1], [2] are the source documents you are extracting the facts from. 51 | """ 52 | 53 | facts_base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect. 54 | 55 | ---------------------- 56 | {scraped_content} 57 | ---------------------- 58 | 59 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) share any insights you might have based on the facts. 60 | 61 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 62 | 63 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the actual number. 64 | 65 | DO NOT PROVIDE A FORECAST, BUT SIMPLY STATE AND SHARE THE FACTS AND INSIGHTS YOU HAVE GATHERED. 66 | """ 67 | 68 | fact_system_prompt = """You are a researcher helping extract facts about {topic}, trends, and related observations. We will give you a piece of content scraped on the web. Please extract facts from this. Each fact should stand on its own, and can be several sentences long if need be. You can have as many facts as needed. For each fact, please start it as a new line with "---" as the bullet point. For example: 69 | 70 | --- Fact 1... This is the fact. 71 | --- Here is a second fact. 72 | --- And a third fact. 
73 | 74 | Please do not include new lines between bullet points. Make sure you write your facts in ENGLISH. Translate any foreign language content/facts/observations into ENGLISH. 75 | 76 | We will simply provide you with content and you will just provide facts.""" 77 | 78 | 79 | def uri_to_local(uri: str) -> str: 80 | """ 81 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 82 | 83 | Args: 84 | uri (str): The URI to convert. 85 | 86 | Returns: 87 | str: The MD5 sum of the URI. 88 | """ 89 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 90 | return uri_md5 91 | 92 | 93 | # TODO If this works, it should be an agent with setllm() supported, etc. 94 | # TODO Right now, we don't actually save sources. It's an important feature (track reliability, etc. too!) but we want to ensure the POC works well first. 95 | class FactRAGFileCache: 96 | 97 | def __init__( 98 | self, 99 | folder_path: str, 100 | openai_api_key: str, 101 | cache_file: str = "cache.json", 102 | sources_file: str = "sources.json", 103 | facts_file: str = "facts.json", 104 | rag_db_folder="cdb", 105 | crawler=None, 106 | ) -> None: 107 | """ 108 | This is a RAG-based fact database. We build a database of facts available in JSON and via RAG and use this as a basic search engine for information. We use ChromaDB to index all facts, but also maintain a list of facts, sources, etc. in a JSON file. Finally, we keep a cache of all content and assume URLs do not get updated; we'll change this process in the future. 109 | 110 | Args: 111 | folder_path (str): The folder where everything will be stored. 112 | openai_api_key (str): The OpenAI API key. Used for RAG embeddings. 113 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json". 114 | sources_file (str, optional): The name of the sources file. Defaults to "sources.json". 115 | facts_file (str, optional): The name of the facts file. Defaults to "facts.json". 116 | rag_db_folder (str, optional): The folder where the ChromaDB database will be stored. Defaults to "cdb". 117 | crawler (optional): The crawler to use. Defaults to None, in which case a Playwright crawler will be used. 118 | """ 119 | self.root_path = folder_path 120 | self.root_parsed = os.path.join(folder_path, "parsed") 121 | self.root_original = os.path.join(folder_path, "original") 122 | self.cache_file = os.path.join(folder_path, cache_file) 123 | self.sources_file = os.path.join(folder_path, sources_file) 124 | self.facts_file = os.path.join(folder_path, facts_file) 125 | self.rag_db_folder = os.path.join(folder_path, rag_db_folder) 126 | self.openai_api_key = openai_api_key 127 | 128 | # Use the same default crawler for all other agents. 129 | # TODO Eventually we'll want to have an array agents we use to get content. 130 | if crawler is None: 131 | self.crawler = crawlerPlaywright() 132 | else: 133 | self.crawler = crawler 134 | 135 | # Set up / load Chroma DB 136 | openai_ef = embedding_functions.OpenAIEmbeddingFunction( 137 | api_key=self.openai_api_key, model_name="text-embedding-3-small" 138 | ) 139 | self.chromadb_client = chromadb.PersistentClient(path=self.rag_db_folder) 140 | self.facts_rag_collection = self.chromadb_client.get_or_create_collection( 141 | name="facts", embedding_function=openai_ef 142 | ) 143 | 144 | # Set up / load cache 145 | self.cache = self.load_cache() 146 | 147 | # Set up / load facts dictionary 148 | # TODO Eventually, move this to a database or table or something. 
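# Note (added for clarity): each fact is stored twice -- as a document in the Chroma "facts"
# collection (ids "f1", "f2", ..., with an "added_on_timestamp" metadata field used for
# date-filtered queries) and as an entry in self.facts (content, source URL, date added)
# persisted to facts.json. See add_fact() below.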
149 | self.facts = self.load_facts() 150 | 151 | # Set up / load sources dictionary 152 | # TODO Eventually, move this to a database or table or something. 153 | self.sources = self.load_sources() 154 | 155 | def query_to_fact_list( 156 | self, query: str, n_results: int = 10, since_date: datetime = None 157 | ) -> dict: 158 | """ 159 | Takes a query and finds the closest semantic matches to the query in the knowledge base. 160 | 161 | Args: 162 | query (str): The query to search for. 163 | n_results (int, optional): The number of results to return. Defaults to 10. 164 | since_date (datetime, optional): The date to search from. Defaults to None, in which case all dates are searched. 165 | 166 | Returns: 167 | dict: A list of the facts found, with the key being the fact ID and each fact having its source, add date, and content info. 168 | """ 169 | 170 | r = [] 171 | if since_date is None: 172 | r = self.facts_rag_collection.query( 173 | query_texts=[query], n_results=n_results 174 | ) 175 | else: 176 | r = self.facts_rag_collection.query( 177 | query_texts=[query], 178 | n_results=n_results, 179 | where={"added_on_timestamp": {"$gt": since_date.timestamp()}}, 180 | ) 181 | 182 | facts = {} 183 | for item in r["ids"][0]: 184 | facts[item] = { 185 | "content": self.facts[item]["content"], 186 | "source": self.facts[item]["source"], 187 | "added": self.facts[item]["added"], 188 | } 189 | 190 | return facts 191 | 192 | def query_to_fact_content( 193 | self, query: str, n_results: int = 10, since_date=None, skip_separator=False 194 | ) -> str: 195 | """ 196 | Takes a query and finds the closest semantic matches to the query in the knowledge base. 197 | 198 | Args: 199 | query (str): The query to search for. 200 | n_results (int, optional): The number of results to return. Defaults to 10. 201 | since_date ([type], optional): The date to search from. Defaults to None, in which case all dates are searched. 202 | skip_separator (bool, optional): Whether to prepend and append a note horizontal line and title to the string being returned. Defaults to False. 203 | 204 | Returns: 205 | str: The content of the facts found, along with the fact IDs. 206 | 207 | """ 208 | 209 | facts = self.query_to_fact_list(query, n_results, since_date) 210 | 211 | if len(facts) == 0: 212 | return "" 213 | 214 | fact_content = "" 215 | if not skip_separator: 216 | fact_content = """--- START FACTS ---------------------------\n""" 217 | 218 | for key, fact in facts.items(): 219 | fact_content += key + ": " + fact["content"] + "\n" 220 | 221 | if not skip_separator: 222 | fact_content += """--- END FACTS ---------------------------\n""" 223 | 224 | return fact_content 225 | 226 | def get_all_recent_facts(self, days: float = 1, skip_separator=False) -> str: 227 | """ 228 | Returns a list of all facts and sources added in the last n days. 229 | 230 | Args: 231 | days (float, optional): The number of days to search back. Defaults to 1. Can be fractional as well. 232 | skip_separator (bool, optional): Whether to prepend and append a note horizontal line and title to the string being returned. Defaults to False. 233 | 234 | Returns: 235 | str: The content of the facts found, along with the fact IDs. 
236 | """ 237 | 238 | fact_content = "" 239 | 240 | if not skip_separator: 241 | fact_content = """--- START FACTS ---------------------------\n""" 242 | 243 | min_date_timestamp = (datetime.now() - timedelta(days=days)).timestamp() 244 | for key, fact in self.facts.items(): 245 | if fact["added_timestamp"] > min_date_timestamp: 246 | fact_content += key + ": " + fact["content"] + "\n" 247 | 248 | if not skip_separator: 249 | fact_content += """--- END FACTS ---------------------------\n""" 250 | 251 | return fact_content 252 | 253 | def save_facts_and_sources(self) -> None: 254 | """ 255 | Saves facts and sources to their respective files. 256 | """ 257 | with open(self.facts_file, "w") as f: 258 | json.dump(self.facts, f, indent=4, cls=DjangoJSONEncoder) 259 | with open(self.sources_file, "w") as f: 260 | json.dump(self.sources, f, indent=4, cls=DjangoJSONEncoder) 261 | 262 | def add_fact(self, fact: str, url: str) -> bool: 263 | """ 264 | Adds a fact to the knowledge base. 265 | 266 | Args: 267 | fact (str): The fact to add. 268 | url (str): The URL source of the fact. 269 | 270 | Returns: 271 | bool: True if the fact was added, False otherwise. 272 | """ 273 | 274 | fact_id_start = self.facts_rag_collection.count() + 1 275 | 276 | added_now = datetime.now() 277 | added_now_timestamp = added_now.timestamp() 278 | 279 | self.facts_rag_collection.add( 280 | documents=[fact], 281 | ids=[f"f{fact_id_start}"], 282 | metadatas=[{"added_on_timestamp": added_now_timestamp}], 283 | ) 284 | 285 | self.facts[f"f{fact_id_start}"] = { 286 | "added": added_now, 287 | "added_timestamp": added_now_timestamp, 288 | "source": url, 289 | "content": fact, 290 | "cid": f"f{fact_id_start}", 291 | } 292 | 293 | self.save_facts_and_sources() 294 | 295 | return True 296 | 297 | def facts_from_url(self, url: str, topic: str) -> None: 298 | """ 299 | Given a URL, extract facts from it and save them to ChromaDB and the facts dictionary. Also returns the facts in an array, in case one wants to analyze new facts. 300 | 301 | Args: 302 | url (str): Location of the content. 303 | topic (str): a brief description of the research you are undertaking. 304 | """ 305 | 306 | content = self.get(url) 307 | 308 | llm = OpenAIGPTWrapper(self.openai_api_key, model="gpt-4-turbo-preview") 309 | chatbot = ChatBot(llm) 310 | chatbot.messages = [{"role": "system", "content": fact_system_prompt}] 311 | 312 | prompt_template = ChatPrompt( 313 | [ 314 | {"role": "system", "content": fact_system_prompt}, 315 | ] 316 | ) 317 | 318 | chatbot.messages = prompt_template.fill(topic=topic) 319 | 320 | response = chatbot.chat(content) 321 | 322 | lines = response.split("\n") 323 | 324 | for line in lines: 325 | if line[0:4] == "--- ": 326 | fact = line[4:] 327 | self.add_fact(fact, url) 328 | 329 | self.save_facts_and_sources() 330 | 331 | # This builds facts based on RSS feeds. 332 | def new_get_rss_links(self, rss_url, topic) -> None: 333 | """ 334 | Crawls an RSS feed and its posts. 335 | 336 | Args: 337 | rss_url (str): The URL of the RSS feed. 338 | topic (str): a brief description of the research you are undertaking. 339 | """ 340 | 341 | rss_agent = RSSAgent(rss_url, crawler=self.crawler) 342 | urls = rss_agent.get_news_as_list() 343 | 344 | for url in urls: 345 | if not self.in_cache(url): 346 | print("RSS RESULT: " + url) 347 | try: 348 | self.facts_from_url(url, topic) 349 | except: 350 | print("Error; failed to get content from " + url) 351 | 352 | # This builds facts based on news articles. 
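# Like new_get_rss_links() above, the News API helper below feeds every uncached article URL
# through facts_from_url(), which asks the LLM for "--- " bullet facts and stores each one via
# add_fact() with the article URL as its source. Illustrative call (hypothetical key and query):
#
#   factdb.new_get_new_info_news(newsapi_api_key, topic="global oil demand",
#                                queries=["oil demand"], top_headlines=False)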
353 | def new_get_new_info_news( 354 | self, 355 | newsapi_api_key, 356 | topic, 357 | queries, 358 | top_headlines=False, 359 | ) -> None: 360 | """ 361 | Uses the News API to find new information and extract facts from it. 362 | 363 | Args: 364 | newsapi_api_key (str): The News API key. 365 | topic (str): a brief description of the research you are undertaking. 366 | queries (list[str]): A list of queries to search for. 367 | top_headlines (bool, optional): Whether to search for top headlines. Defaults to False. 368 | """ 369 | 370 | news_agent = NewsAPIAgent( 371 | newsapi_api_key, top_headlines=top_headlines, crawler=self.crawler 372 | ) 373 | 374 | for q in queries: 375 | results = news_agent.get_news_as_list(q) 376 | for result in results["articles"]: 377 | url = result["url"] 378 | if not self.in_cache(url): 379 | print("NEWS RESULT: " + url) 380 | self.facts_from_url(url, topic) 381 | 382 | # POC for FT 383 | def get_ft_news(self, ft_user, ft_pass, topic) -> None: 384 | """ 385 | Uses the Financial Times Agent to find new information and extract facts from it. 386 | 387 | Args: 388 | ft_user (str): The Financial Times username. 389 | ft_pass (str): The Financial Times password. 390 | topic (str): a brief description of the research you are undertaking. 391 | """ 392 | 393 | fta = FinancialTimesAgent(ft_user, ft_pass) 394 | urls, html_content, text_content = fta.get_news() 395 | 396 | if len(urls) != len(text_content): 397 | raise ValueError("URLs and text content are not the same length.") 398 | 399 | for i in range(0, len(urls)): 400 | url = urls[i] 401 | content = text_content[i] 402 | 403 | if not self.in_cache(url): 404 | print("FT RESULT: " + url) 405 | self.force_content(url, content) 406 | self.facts_from_url(url, topic) 407 | 408 | # This builds facts based on all the google searches. 409 | def new_get_new_info_google( 410 | self, 411 | google_api_key, 412 | google_search_id, 413 | google_search_queries, 414 | topic, 415 | ) -> None: 416 | """ 417 | Uses Google search to find new information and extract facts from it. 418 | 419 | Args: 420 | google_api_key (str): The Google API key. 421 | google_search_id (str): The Google search ID. 422 | google_search_queries (list[str]): A list of queries to search for. 423 | topic (str): a brief description of the research you are undertaking. 424 | """ 425 | 426 | self.google_api_key = google_api_key 427 | self.google_search_id = google_search_id 428 | self.google_search_queries = google_search_queries 429 | 430 | webagent = WebSearchAgent(api_key=self.google_api_key) 431 | 432 | for google_search_query in self.google_search_queries: 433 | 434 | results = webagent.search_google( 435 | query=google_search_query, 436 | custom_search_engine_id=self.google_search_id, 437 | num=_DEFAULT_NUM_SEARCH_RESULTS, 438 | ) 439 | 440 | for result in results: 441 | if not self.in_cache(result.url): 442 | try: 443 | print("SEARCH RESULT: " + result.url) 444 | # page_content = self.get(result.url) 445 | self.facts_from_url(result.url, topic) 446 | # print(page_content) 447 | except Exception as e: 448 | print(f"Failed to get content from {result.url}\n{e}") 449 | 450 | def save_state(self) -> None: 451 | """ 452 | Saves the in-memory changes to the knowledge base to the JSON cache file. 453 | """ 454 | with open(self.cache_file, "w") as f: 455 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 456 | 457 | def load_facts(self) -> dict: 458 | """ 459 | Loads the facts from the facts file. 
460 | """ 461 | if not os.path.exists(self.facts_file): 462 | with open(self.facts_file, "w") as f: 463 | f.write("{}") 464 | 465 | with open(self.facts_file, "r") as f: 466 | self.facts = json.load(f) 467 | 468 | return self.facts 469 | 470 | def load_sources(self) -> dict: 471 | """ 472 | Loads the sources from the sources file. 473 | """ 474 | if not os.path.exists(self.sources_file): 475 | with open(self.sources_file, "w") as f: 476 | f.write("{}") 477 | 478 | with open(self.sources_file, "r") as f: 479 | self.sources = json.load(f) 480 | 481 | return self.sources 482 | 483 | def load_cache(self) -> None: 484 | """ 485 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 486 | """ 487 | 488 | if not os.path.exists(self.root_path): 489 | os.makedirs(self.root_path) 490 | 491 | if not os.path.exists(self.root_parsed): 492 | os.makedirs(self.root_parsed) 493 | 494 | if not os.path.exists(self.root_original): 495 | os.makedirs(self.root_original) 496 | 497 | if not os.path.exists(self.cache_file): 498 | with open(self.cache_file, "w") as f: 499 | f.write("{}") 500 | 501 | with open(self.cache_file, "r") as f: 502 | return json.load(f) 503 | 504 | def in_cache(self, uri: str) -> bool: 505 | """ 506 | Checks if a URI is in the cache already. 507 | 508 | Args: 509 | uri (str): The URI to check. 510 | 511 | Returns: 512 | bool: True if the URI is in the cache, False otherwise. 513 | """ 514 | if uri in self.cache: 515 | return True 516 | return False 517 | 518 | def update_cache( 519 | self, uri: str, obtained_on: datetime, last_accessed: datetime 520 | ) -> None: 521 | """ 522 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 523 | 524 | Args: 525 | uri (str): The URI to update. 526 | obtained_on (datetime): The date and time when the content was obtained. 527 | last_accessed (datetime): The date and time when the content was last accessed. 528 | """ 529 | uri_md5 = uri_to_local(uri) 530 | self.cache[uri] = { 531 | "obtained_on": obtained_on, 532 | "last_accessed": last_accessed, 533 | "accessed": 0, 534 | "uri_md5": uri_md5, 535 | } 536 | self.save_state() 537 | 538 | def log_access(self, uri: str) -> None: 539 | """ 540 | Saves the last accessed time and updates the accessed tracker for a given URI. 541 | 542 | Args: 543 | uri (str): The URI to update. 544 | """ 545 | self.cache[uri]["last_accessed"] = datetime.now() 546 | self.cache[uri]["accessed"] = 1 547 | self.save_state() 548 | 549 | def get_unaccessed_content(self) -> list[str]: 550 | """ 551 | Returns a list of URIs that have not been accessed by the agent. 552 | 553 | Returns: 554 | list[str]: A list of URIs that have not been accessed by the agent. 555 | """ 556 | unaccessed = [] 557 | for uri in self.cache: 558 | if self.cache[uri]["accessed"] == 0: 559 | unaccessed.append(uri) 560 | return unaccessed 561 | 562 | def force_content(self, uri: str, content: str, check_exists: bool = True) -> bool: 563 | """ 564 | Forces a specific URI to have specific content (both HTML and text content). Used to fill old links that we don't actually want to crawl. 565 | 566 | Args: 567 | uri (str): The URI to force content for. 568 | content (str): The content to force. 569 | check_exists (bool): checks if content has already been included in the cache before forcing the new content. 570 | 571 | Returns: 572 | bool: True if the content was forced, False otherwise. 
573 | """ 574 | 575 | # If the content already exists and we avoid overwrites, then we don't want to overwrite it. 576 | if check_exists and self.in_cache(uri): 577 | return False 578 | 579 | uri_md5 = uri_to_local(uri) 580 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 581 | f.write(content) 582 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 583 | f.write(content) 584 | 585 | self.update_cache(uri, datetime.now(), datetime.now()) 586 | self.log_access(uri) 587 | 588 | return True 589 | 590 | def get(self, uri: str) -> str: 591 | """ 592 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 593 | 594 | Args: 595 | uri (str): The URI to get the content for. 596 | 597 | Returns: 598 | str: The content for the given URI. 599 | """ 600 | uri_md5 = uri_to_local(uri) 601 | if uri in self.cache: 602 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 603 | return f.read() 604 | else: 605 | # scraper = WebpageAgent() 606 | 607 | # content_raw = scraper.scrape(uri, text_only=False, body_only=False) 608 | # with open(os.path.join(self.root_original, uri_md5), "w") as f: 609 | # f.write(content_raw) 610 | 611 | # content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 612 | # with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 613 | # f.write(content_parsed) 614 | 615 | try: 616 | content, text = self.crawler.get_content(uri) 617 | except Exception as e: 618 | print(f"Failed to get content from {uri}\n{e}") 619 | content = "" 620 | text = "" 621 | 622 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 623 | f.write(content) 624 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 625 | f.write(text) 626 | 627 | self.update_cache(uri, datetime.now(), datetime.now()) 628 | 629 | return text 630 | 631 | def add_content(self, content: str, uri: str = None) -> None: 632 | """ 633 | Adds content to cache. 634 | 635 | Args: 636 | content (str): The content to add to the cache. 637 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 638 | """ 639 | if uri is None: 640 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 641 | uri_md5 = uri_to_local(uri) 642 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 643 | f.write(content) 644 | self.update_cache(uri, datetime.now(), datetime.now()) 645 | 646 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 647 | """ 648 | Adds content from a text file to the cache. 649 | 650 | Args: 651 | filepath (str): The path to the file to add to the cache. 652 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 653 | """ 654 | with open(filepath, "r") as f: 655 | content = f.read() 656 | self.add_content(content, uri) 657 | 658 | 659 | class FactBot: 660 | 661 | def __init__( 662 | self, 663 | knowledge_db: FactRAGFileCache, 664 | openai_api_key: str = None, 665 | chatbot: ChatBot = None, 666 | ) -> None: 667 | """ 668 | The FactBot is like a ChatBot but enables you to ask questions that reference an underlying RAG database (KnowledgeBaseFileCache), which then enables the chatbot to cite sourcable facts. 669 | 670 | Args: 671 | knowledge_db (FactRAGFileCache): The knowledge database to use. 672 | openai_api_key (str, optional): The OpenAI API key. Defaults to None. 673 | chatbot (ChatBot, optional): The PhaseLLB chatbot to use. 
Defaults to None, in which case an OpenAI chatbot is used (and the OpenAI API key must be provided). 674 | """ 675 | if openai_api_key is None and chatbot is None: 676 | raise ValueError("One of openai_api_key or chatbot must be provided.") 677 | 678 | if chatbot is not None: 679 | self.chatbot = chatbot 680 | else: 681 | llm = OpenAIGPTWrapper(openai_api_key, model="gpt-4-turbo-preview") 682 | self.chatbot = ChatBot(llm) 683 | self.chatbot.messages = [ 684 | {"role": "system", "content": system_prompt_question_continuous} 685 | ] 686 | 687 | self.knowledge_db = knowledge_db 688 | 689 | def ask(self, question: str, clean_sources: bool = True) -> str: 690 | """ 691 | Ask a question to the FactBot. This will query the underlying knowledge database and use the returned facts to answer the question. 692 | 693 | Args: 694 | question (str): The question to ask. 695 | clean_sources (bool, optional): Whether to clean the sources in the response. Defaults to True; in this case, it will replace fact IDs with relevant source links at the end of the response. 696 | 697 | Returns: 698 | str: The response to the question. 699 | """ 700 | message = self.knowledge_db.query_to_fact_content(question) + "\n\n" + question 701 | response = self.chatbot.chat(message) 702 | if clean_sources: 703 | return clean_fact_citations(self.knowledge_db, response) 704 | else: 705 | return response 706 | 707 | def source(self, fact_id: str) -> str: 708 | """ 709 | Returns the URL source for a given fact ID. 710 | 711 | Args: 712 | fact_id (str): The fact ID to get the source for. 713 | 714 | Returns: 715 | str: The URL source for the given fact ID. 716 | """ 717 | if fact_id in self.knowledge_db.facts: 718 | return self.knowledge_db.facts[fact_id]["source"] 719 | 720 | if fact_id.lower() in self.knowledge_db.facts: 721 | return self.knowledge_db.facts[fact_id.lower()]["source"] 722 | 723 | raise ValueError( 724 | f"Fact ID " + str(fact_id) + " not found in the knowledge database." 725 | ) 726 | 727 | def clean_and_source_to_html( 728 | self, text_to_clean: str, start_count: int = 0 729 | ) -> list: 730 | """ 731 | Returns a formatted response with sourced HTML. This is used for emergingtrajectories.com and acts as a base for anyone else wanting to build similar features. 732 | 733 | Args: 734 | text_to_clean: The text to clean/cite/source. 735 | start_count: The starting count for the sources. 736 | 737 | Returns: 738 | list: two strings -- the actual response in the first case, and the sources in the second case, and an integer representing the new source count. 
739 | """ 740 | 741 | pattern = r"\[f[\d\s\,f]+\]" 742 | new_text = "" 743 | sources_text = "" 744 | ref_ctr = start_count 745 | last_index = 0 746 | 747 | for match in re.finditer(pattern, text_to_clean, flags=re.IGNORECASE): 748 | 749 | if match.group(0).find(",") == -1: 750 | ref_ctr += 1 751 | ref = match.group(0)[1:-1].strip() 752 | ref = ref.lower() 753 | 754 | new_text += text_to_clean[last_index : match.start()] 755 | new_text += f"""{ref_ctr}""" 756 | 757 | # Save the source 758 | fact_text = self.knowledge_db.facts[ref]["content"] 759 | new_source_text = f"""{ref_ctr}: {fact_text} View Source""" 760 | sources_text += new_source_text + "\n" 761 | 762 | last_index = match.end() 763 | else: 764 | refs = match.group(0)[1:-1].split(",") 765 | ref_arr = [] 766 | ref_str = "" 767 | for ref in refs: 768 | ref = ref.strip() 769 | ref = ref.lower() 770 | ref_ctr += 1 771 | ref_arr.append(str(ref_ctr)) 772 | 773 | # Add the source to the text 774 | new_text_source_num = f"""{ref_ctr}""" 775 | ref_str += " " + new_text_source_num 776 | 777 | # Save the source 778 | fact_text = self.knowledge_db.facts[ref]["content"] 779 | new_source_text = f"""{ref_ctr}: ${fact_text} View Source""" 780 | sources_text += new_source_text + "\n" 781 | 782 | new_text += text_to_clean[last_index : match.start()] + ref_str 783 | last_index = match.end() 784 | 785 | new_text += text_to_clean[last_index:] 786 | 787 | return new_text, sources_text, ref_ctr 788 | 789 | 790 | def clean_fact_citations(knowledge_db: FactRAGFileCache, text_to_clean: str) -> str: 791 | """ 792 | Converts fact IDs referenced in a piece of text to relevant source links, appending sources as end notes in the document/text. 793 | 794 | Args: 795 | knowledge_db (FactRAGFileCache): The knowledge database to use for fact lookups. 796 | text_to_clean (str): The text to clean. 797 | 798 | Returns: 799 | str: The cleaned text. 800 | """ 801 | bot = FactBot(knowledge_db, knowledge_db.openai_api_key) 802 | pattern = r"\[f[\d\s\,f]+\]" 803 | new_text = "" 804 | ref_ctr = 0 805 | last_index = 0 806 | sources_list = "" 807 | for match in re.finditer(pattern, text_to_clean): 808 | if match.group(0).find(",") == -1: 809 | ref_ctr += 1 810 | ref = match.group(0)[1:-1].strip() 811 | new_text += text_to_clean[last_index : match.start()] 812 | new_text += f"[{ref_ctr}]" 813 | sources_list += f"{ref_ctr} :: " + bot.source(f"{ref}") + "\n" 814 | last_index = match.end() 815 | else: 816 | refs = match.group(0)[1:-1].split(",") 817 | ref_arr = [] 818 | for ref in refs: 819 | ref = ref.strip() 820 | ref_ctr += 1 821 | ref_arr.append(str(ref_ctr)) 822 | sources_list += f"{ref_ctr} :: " + bot.source(f"{ref}") + "\n" 823 | ref_str = "[" + ", ".join(ref_arr) + "]" 824 | new_text += text_to_clean[last_index : match.start()] + ref_str 825 | last_index = match.end() 826 | 827 | new_text += text_to_clean[last_index:] 828 | 829 | if ref_ctr == 0: 830 | return text_to_clean 831 | else: 832 | return new_text + "\n\nSources:\n" + sources_list 833 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . 
import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be based on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 13 | 14 | F1: The President of the USA is Joe Biden. 15 | F2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answer should be something like this: 18 | 19 | The President of the USA is Joe Biden [F1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversation chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity.
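# Usage sketch for FactsRAGForecastingAgent (illustrative only -- the API keys,
# statement ID, and model name below are hypothetical placeholders, and the
# FactRAGFileCache is assumed to have been built and populated elsewhere):
#
#   llm = OpenAIGPTWrapper("sk-...", model="gpt-4-turbo-preview")
#   chatbot = ChatBot(llm)
#   et_client = ETClient("et-api-key")
#   statement = et_client.get_statement(123)
#   factbase = ...  # an existing, already-populated FactRAGFileCache instance
#   agent = FactsRAGForecastingAgent(et_client, chatbot, factbase)
#   agent.create_forecast(statement, "sk-...", "et-api-key", prediction_agent="Example Agent")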
80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
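# The since_date filter below limits both queries to facts ingested after the prior
# forecast's created_at timestamp; if neither query returns anything new, we print a
# notice and return None without creating a new forecast on the platform.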
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster2.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag2 import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be absed on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 
13 | 14 | F1: The President of the USA is Joe Biden. 15 | F2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answers hould be something like this: 18 | 19 | The President of th USA is Joe Biden [F1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversatio chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 
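# Follow-up sketch: because the ChatBot is kept on the agent (see getChatBot above),
# the conversation can be continued after a forecast to ask for clarifications.
# Illustrative only -- the question below is a hypothetical example:
#
#   agent.create_forecast(statement, openai_api_key, et_api_key)
#   bot = agent.getChatBot()
#   clarification = bot.chat("Which facts most influenced the number you chose?")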
88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster3.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag3 import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be absed on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 
13 | 14 | f1: The President of the USA is Joe Biden. 15 | f2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answers hould be something like this: 18 | 19 | The President of th USA is Joe Biden [f1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversatio chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 
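# Sketch of updating an existing forecast, assuming an `agent` and `et_client` set up
# as in the earlier usage sketch (illustrative only; the forecast ID is a hypothetical
# placeholder). extend_forecast() pulls only facts added to the fact base since the
# prior forecast's created_at, and links the new forecast to the old one via forecast.id:
#
#   prior = et_client.get_forecast(456)
#   agent.extend_forecast(prior, openai_api_key, et_api_key, prediction_agent="Example Agent")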
88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/knowledge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solutions for finding, extracting, storing, and reviisting knowledge. 3 | """ 4 | 5 | """ 6 | This is the first knowledge base, and is meant to be a POC, really. 7 | 8 | All of our agents as of today (Feb 1) focus on web searches and website content. Today, we do a Google search and scrape the content from the top results. We repeat this process every time the agent runs. 9 | 10 | An obvious next step would be to create some sort of a cache to see if we already scraped the page and included the content elsewhere. 11 | 12 | This should also be able to do multiple searches *and* accept other URLs to scrape. 13 | 14 | How would this one work? 15 | 1. Have a folder where things get cached. 16 | 2. 
Have a JSON file that tracks when a knowledge base was accessed, the source URL, etc. 17 | """ 18 | 19 | import os 20 | import json 21 | import hashlib 22 | 23 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 24 | from django.core.serializers.json import DjangoJSONEncoder 25 | 26 | from phasellm.agents import WebpageAgent 27 | 28 | from datetime import datetime 29 | 30 | from . import Client 31 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 32 | 33 | """ 34 | CACHE STRUCTURE IN JSON... 35 | 36 | key: URI (URL or file name) 37 | value: { 38 | "obtained_on": ; when the file was downloaded 39 | "last_accessed": ; when the file was last used by the agent 40 | "accessed": 0 if not accessed, 1 if accessed 41 | "uri_md5": the MD5 sum of the URI 42 | } 43 | 44 | """ 45 | 46 | 47 | def statement_to_search_queries( 48 | statement_id: int, client: Client, openai_api_key: str, num_queries: int = 3 49 | ) -> list[str]: 50 | """ 51 | Given a specific statement ID, this will return a list of queries you can put into a search engine to get useful information. 52 | 53 | Args: 54 | statement_id (int): The ID of the statement to get search queries for. 55 | client (Client): The Emerging Trajectories API client. 56 | openai_api_key (str): The OpenAI API key. 57 | num_queries (int, optional): The number of queries to return. Defaults to 3. 58 | 59 | Returns: 60 | list[str]: A list of search queries. 61 | 62 | """ 63 | 64 | statement = client.get_statement(statement_id) 65 | # print(statement) 66 | 67 | llm = OpenAIGPTWrapper(openai_api_key, model="gpt-3.5-turbo") 68 | chatbot = ChatBot(llm) 69 | 70 | chatbot.messages = [ 71 | { 72 | "role": "system", 73 | "content": f"""I am working on a research project about this topic:\n{statement['title']}\n\n{statement['description']}\n\nHere is more information about what I am trying to do:\n{statement['description']}""", 74 | }, 75 | { 76 | "role": "user", 77 | "content": f"Could you please provide me with up to {num_queries} search queries that I can input into a search engine to find info about this topic? Please do not qualify your response... Simply provide one search query per line and nothing else.", 78 | }, 79 | ] 80 | 81 | response = chatbot.resend() 82 | lines = response.strip().split("\n") 83 | 84 | if len(lines) > num_queries: 85 | lines = lines[:num_queries] 86 | 87 | return lines 88 | 89 | 90 | def uri_to_local(uri: str) -> str: 91 | """ 92 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 93 | 94 | Args: 95 | uri (str): The URI to convert. 96 | 97 | Returns: 98 | str: The MD5 sum of the URI. 99 | """ 100 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 101 | return uri_md5 102 | 103 | 104 | class KnowledgeBaseFileCache: 105 | 106 | def __init__(self, folder_path: str, cache_file: str = "cache.json") -> None: 107 | """ 108 | The KnowledgeBaseFileCache is a simple file-based cache for web content and local files. The cache stores the original HTML, PDF, or TXT content and tracks when (if ever) an agent actually accessed the content. 109 | 110 | Args: 111 | folder_path (str): The folder where the cache will be stored. 112 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json".
113 | """ 114 | self.root_path = folder_path 115 | self.root_parsed = os.path.join(folder_path, "parsed") 116 | self.root_original = os.path.join(folder_path, "original") 117 | self.cache_file = os.path.join(folder_path, cache_file) 118 | self.cache = self.load_cache() 119 | 120 | def save_state(self) -> None: 121 | """ 122 | Saves the in-memory changes to the knowledge base to the JSON cache file. 123 | """ 124 | with open(self.cache_file, "w") as f: 125 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 126 | 127 | def load_cache(self) -> None: 128 | """ 129 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 130 | """ 131 | 132 | if not os.path.exists(self.root_path): 133 | os.makedirs(self.root_path) 134 | 135 | if not os.path.exists(self.root_parsed): 136 | os.makedirs(self.root_parsed) 137 | 138 | if not os.path.exists(self.root_original): 139 | os.makedirs(self.root_original) 140 | 141 | if not os.path.exists(self.cache_file): 142 | with open(self.cache_file, "w") as f: 143 | f.write("{}") 144 | 145 | with open(self.cache_file, "r") as f: 146 | return json.load(f) 147 | 148 | def in_cache(self, uri: str) -> bool: 149 | """ 150 | Checks if a URI is in the cache already. 151 | 152 | Args: 153 | uri (str): The URI to check. 154 | 155 | Returns: 156 | bool: True if the URI is in the cache, False otherwise. 157 | """ 158 | if uri in self.cache: 159 | return True 160 | return False 161 | 162 | def update_cache( 163 | self, uri: str, obtained_on: datetime, last_accessed: datetime 164 | ) -> None: 165 | """ 166 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 167 | 168 | Args: 169 | uri (str): The URI to update. 170 | obtained_on (datetime): The date and time when the content was obtained. 171 | last_accessed (datetime): The date and time when the content was last accessed. 172 | """ 173 | uri_md5 = uri_to_local(uri) 174 | self.cache[uri] = { 175 | "obtained_on": obtained_on, 176 | "last_accessed": last_accessed, 177 | "accessed": 0, 178 | "uri_md5": uri_md5, 179 | } 180 | self.save_state() 181 | 182 | def log_access(self, uri: str) -> None: 183 | """ 184 | Saves the last accessed time and updates the accessed tracker for a given URI. 185 | 186 | Args: 187 | uri (str): The URI to update. 188 | """ 189 | self.cache[uri]["last_accessed"] = datetime.now() 190 | self.cache[uri]["accessed"] = 1 191 | self.save_state() 192 | 193 | def get_unaccessed_content(self) -> list[str]: 194 | """ 195 | Returns a list of URIs that have not been accessed by the agent. 196 | 197 | Returns: 198 | list[str]: A list of URIs that have not been accessed by the agent. 199 | """ 200 | unaccessed = [] 201 | for uri in self.cache: 202 | if self.cache[uri]["accessed"] == 0: 203 | unaccessed.append(uri) 204 | return unaccessed 205 | 206 | def get(self, uri: str) -> str: 207 | """ 208 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 209 | 210 | Args: 211 | uri (str): The URI to get the content for. 212 | 213 | Returns: 214 | str: The content for the given URI. 
215 | """ 216 | uri_md5 = uri_to_local(uri) 217 | if uri in self.cache: 218 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 219 | return f.read() 220 | else: 221 | scraper = WebpageAgent() 222 | 223 | content_raw = scraper.scrape(uri, text_only=False, body_only=False) 224 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 225 | f.write(content_raw) 226 | 227 | content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 228 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 229 | f.write(content_parsed) 230 | 231 | self.update_cache(uri, datetime.now(), datetime.now()) 232 | 233 | return content_parsed 234 | 235 | def add_content(self, content: str, uri: str = None) -> None: 236 | """ 237 | Adds content to cache. 238 | 239 | Args: 240 | content (str): The content to add to the cache. 241 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 242 | """ 243 | if uri is None: 244 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 245 | uri_md5 = uri_to_local(uri) 246 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 247 | f.write(content) 248 | self.update_cache(uri, datetime.now(), datetime.now()) 249 | 250 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 251 | """ 252 | Adds content from a text file to the cache. 253 | 254 | Args: 255 | filepath (str): The path to the file to add to the cache. 256 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 257 | """ 258 | with open(filepath, "r") as f: 259 | content = f.read() 260 | self.add_content(content, uri) 261 | -------------------------------------------------------------------------------- /emergingtrajectories/news.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import feedparser 3 | import time 4 | import random 5 | 6 | from .crawlers import crawlerPlaywright, _get_text_bs4 7 | 8 | from playwright.sync_api import sync_playwright 9 | 10 | from news_search_client import NewsSearchClient 11 | from azure.core.credentials import AzureKeyCredential 12 | 13 | 14 | def force_empty_content(rss_url: str, content, cache_function) -> None: 15 | """ 16 | Force the crawler to visit every URL in the RSS feed and save it as a blank content file. We do this because some RSS feeds have a lot of old URLs we do not need to crawl, and only want to crawl the delta over some period. 17 | 18 | Args: 19 | rss_url (str): The URL of the RSS feed. 20 | content: the content string to save. 21 | cache_function: the specific function to call the rss_url and content to save. 22 | """ 23 | 24 | agent = RSSAgent(rss_url) 25 | all_urls = agent.get_news_as_list() 26 | for u in all_urls: 27 | cache_function(u, content) 28 | 29 | 30 | class RSSAgent: 31 | 32 | def __init__(self, rss_url, crawler=None) -> None: 33 | """ 34 | A simple wrapper for an RSS feed, so we can query it for URLs. 35 | 36 | Args: 37 | rss_url (str): The URL of the RSS feed. 38 | crawler (Crawler, optional): The crawler to use. Defaults to None, in which case we will use crawlerPlaywright in headless mode. 39 | """ 40 | self.rss_url = rss_url 41 | if crawler is None: 42 | self.crawler = crawlerPlaywright() 43 | else: 44 | self.crawler = crawler 45 | 46 | def get_news_as_list(self) -> list: 47 | """ 48 | Query the RSS feed for news articles, and return them as a list of dictionaries. 
49 | 50 | Returns: 51 | list: A list of URLs. 52 | """ 53 | urls = [] 54 | feed = feedparser.parse(self.rss_url) 55 | for entry in feed.entries: 56 | urls.append(entry.link) 57 | return urls 58 | 59 | 60 | class NewsBingAgent: 61 | 62 | def __init__(self, api_key: str, endpoint: str): 63 | """ 64 | Creates a new Bing News API agent. To learn more, see: https://github.com/microsoft/bing-search-sdk-for-python/ 65 | 66 | Args: 67 | api_key (str): The Bing News API key. 68 | endpoint (str): The Bing News API endpoint. 69 | """ 70 | self.api_key = api_key 71 | self.endpoint = endpoint 72 | 73 | def get_news_as_list(self, query: str, market: str = "en-us") -> list: 74 | """ 75 | Gets a list of URLS from the Bing News API. For more information on markets, see: https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes 76 | 77 | Args: 78 | query (str): The query to search for. 79 | market (str, optional): The market to search in. Defaults to "en-us". (US English 80 | 81 | Returns: 82 | list: A list of URLs. 83 | """ 84 | 85 | client = NewsSearchClient( 86 | endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) 87 | ) 88 | 89 | urls = [] 90 | 91 | try: 92 | news_result = client.news.search(query=query, market=market, count=10) 93 | for n in news_result.value: 94 | urls.append(n.url) 95 | 96 | except Exception as err: 97 | print("Encountered exception. {}".format(err)) 98 | 99 | return urls 100 | 101 | 102 | class NewsAPIAgent: 103 | 104 | def __init__(self, api_key, top_headlines=False, crawler=None) -> None: 105 | """ 106 | A simple wrapper for the News API, so we can query it for URLs. 107 | 108 | Args: 109 | api_key (str): The News API key. 110 | top_headlines (bool, optional): Whether to get top headlines. Defaults to False. 111 | crawler (Crawler, optional): The crawler to use. Defaults to None, in which case we will use crawlerPlaywright in headless mode. 112 | """ 113 | self.api_key = api_key 114 | self.top_headlines = top_headlines 115 | if crawler is None: 116 | self.crawler = crawlerPlaywright() 117 | else: 118 | self.crawler = crawler 119 | 120 | def get_news_as_list(self, query: str) -> list: 121 | """ 122 | Query the News API for news articles, and return them as a list of dictionaries. 123 | 124 | Args: 125 | query (str): The query to search for. 126 | 127 | Returns: 128 | list: A list of dictionaries, where each dictionary represents a news article. 129 | """ 130 | url = f"https://newsapi.org/v2/everything?q={query}&apiKey={self.api_key}" 131 | if self.top_headlines: 132 | url = ( 133 | f"https://newsapi.org/v2/top-headlines?q={query}&apiKey={self.api_key}" 134 | ) 135 | response = requests.get(url) 136 | return response.json() 137 | 138 | 139 | class FinancialTimesAgent: 140 | 141 | # The RSS feed URLs for the Financial Times. 142 | ft_rss_feed_urls = [ 143 | "https://www.ft.com/rss/home", 144 | "https://www.ft.com/world?format=rss", 145 | "https://www.ft.com/global-economy?format=rss", 146 | "https://www.ft.com/companies?format=rss", 147 | "https://www.ft.com/opinion?format=rss", 148 | ] 149 | 150 | ft_login_url = "https://ft.com/login" 151 | ft_main_url = "https://ft.com/" 152 | 153 | def __init__(self, user_email, user_password) -> None: 154 | """ 155 | This is a POC agent that uses Playwright to crawl the Financial Times articles you are interested in. Note that you *NEED* to be a subscriber to the FT to make this work, and thus need to provide your FT user name and password. 156 | 157 | Args: 158 | user_email (str): Your FT email. 
159 | user_password (str): Your FT password. 160 | """ 161 | self.user_email = user_email 162 | self.user_password = user_password 163 | 164 | def get_news(self, urls: list[str] = None) -> list: 165 | """ 166 | Get the news from the Financial Times as a list of tuples, where each tuple contains the URL and the extracted text content. 167 | 168 | Args: 169 | urls: a list of URLs to get content for. 170 | 171 | Returns: 172 | A list of lists -- urls, html, and text content 173 | """ 174 | 175 | if urls is None: 176 | urls = set() 177 | for rss_url in self.ft_rss_feed_urls: 178 | agent = RSSAgent(rss_url) 179 | rss_url_list = agent.get_news_as_list() 180 | for r in rss_url_list: 181 | urls.add(r) 182 | urls = list(urls) 183 | 184 | html_content_array = [] 185 | text_content_array = [] 186 | 187 | with sync_playwright() as playwright: 188 | 189 | browser = playwright.firefox.launch(headless=False) 190 | page = browser.new_page() 191 | 192 | # Navigate to the webpage 193 | page.goto(self.ft_main_url) 194 | 195 | print("Accepting Cookies") 196 | page.frame_locator('*[title="SP Consent Message"]').get_by_text( 197 | "Accept Cookies" 198 | ).click() 199 | 200 | time.sleep(2) 201 | 202 | page.goto(self.ft_login_url) 203 | 204 | time.sleep(2) 205 | 206 | print("Entering user name + hitting enter") 207 | 208 | page.locator("#enter-email").fill(self.user_email) 209 | page.keyboard.press("Enter") 210 | 211 | time.sleep(5) 212 | 213 | page.locator("#enter-password").fill(self.user_password) 214 | page.keyboard.press("Enter") 215 | 216 | time.sleep(5) 217 | 218 | url_ctr = 1 219 | for url in urls: 220 | print(f"Getting content for URL {url_ctr} of {len(urls)}") 221 | 222 | html_content = "" 223 | text_content = "" 224 | 225 | try: 226 | page.goto(url) 227 | html_content = page.content() 228 | text_content = _get_text_bs4(html_content) 229 | 230 | print(url) 231 | print(text_content) 232 | 233 | except: 234 | print(url) 235 | print(f"Error getting content for URL {url_ctr} of {len(urls)}") 236 | 237 | html_content_array.append(html_content) 238 | text_content_array.append(text_content) 239 | 240 | url_ctr += 1 241 | 242 | time.sleep(2 + random.randint(0, 5)) 243 | 244 | # Close the browser 245 | browser.close() 246 | 247 | return urls, html_content_array, text_content_array 248 | -------------------------------------------------------------------------------- /emergingtrajectories/pdf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a very simple set of utility function(s) for loading PDF content. In fact, it might be easier to just use PyPDF directly and avoid this altogether. In the future, we might create specialized functions and classes for doing "fancy" things with PDFs (e.g., OCR, tables, etc.) so have created this module as a way to keep this in mind. 3 | """ 4 | 5 | from pypdf import PdfReader 6 | import requests 7 | import io 8 | 9 | 10 | def get_PDF_content_from_file_by_page(file_path: str) -> list: 11 | """ 12 | Loads a PDF file and extracts the text into a list of strings, one for each page. 13 | 14 | Args: 15 | file_path (str): The path to the PDF file. 16 | 17 | Returns: 18 | list: A list of strings, one for each page. 
19 | """ 20 | reader = PdfReader(file_path) 21 | content = [] 22 | for page in reader.pages: 23 | content.append(page.extract_text()) 24 | return content 25 | 26 | 27 | def get_PDF_content_from_url_by_page(url: str) -> list: 28 | """ 29 | Loads a PDF file from a URL and extracts the text into a list of strings, one for each page. 30 | 31 | Args: 32 | url (str): The URL to the PDF file. 33 | 34 | Returns: 35 | list: A list of strings, one for each page. 36 | """ 37 | response = requests.get(url=url, timeout=120) 38 | pdf_file = io.BytesIO(response.content) 39 | reader = PdfReader(pdf_file) 40 | content = [] 41 | for page in reader.pages: 42 | content.append(page.extract_text()) 43 | return content 44 | 45 | 46 | def get_PDF_content_by_page_from_file(file_path: str) -> str: 47 | """ 48 | Loads a PDF file and extracts the text into one big string. 49 | 50 | Args: 51 | file_path (str): The path to the PDF file. 52 | 53 | Returns: 54 | str: The text content of the PDF file. 55 | """ 56 | reader = PdfReader(file_path) 57 | content = "" 58 | for page in reader.pages: 59 | content += page.extract_text() + "\n" 60 | return content 61 | 62 | 63 | def get_PDF_content_by_page_from_url(url: str) -> str: 64 | """ 65 | Loads a PDF file from a URL and extracts the text into one big string. 66 | 67 | Args: 68 | url (str): The URL to the PDF file. 69 | 70 | Returns: 71 | str: The text content of the PDF file. 72 | """ 73 | response = requests.get(url=url, timeout=120) 74 | pdf_file = io.BytesIO(response.content) 75 | reader = PdfReader(pdf_file) 76 | content = "" 77 | for page in reader.pages: 78 | content += page.extract_text() + "\n" 79 | return content 80 | -------------------------------------------------------------------------------- /emergingtrajectories/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a convenience file for tracking prompts. We'll likely remove this in the (near) future. 3 | """ 4 | 5 | system_prompt_question_continuous = """You are a research agent that is meant to answer questions about specific points and topics. The facts you reference in answering these questions should all be based on information we provide you. We will provide you knowledge base below, where each fact is preceded by an ID (e.g., F1, F2, etc.). All your answers should be absed on these facts ONLY. 6 | 7 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 8 | 9 | F1: The President of the USA is Joe Biden. 10 | F2: The Vice President of the USA is Kamala Harris. 11 | 12 | ... your answers hould be something like this: 13 | 14 | The President of th USA is Joe Biden [F1]. 15 | 16 | We will give you a list of facts for every question. You can reference those facts using square brackets and the fact ID, so [F123] for fact 123, or you can also reference earlier facts from the conversation chain. YOU CANNOT USE OTHER INFORMATION.""" 17 | -------------------------------------------------------------------------------- /emergingtrajectories/recursiveagent.py: -------------------------------------------------------------------------------- 1 | """ 2 | x(t+1) = x(t) + z 3 | ... where x(t) is the current observation about the world 4 | ... 
z is the set of scenarios that will impact x in the future (i.e., x(t+1)) 5 | 6 | This is influenced by Yann LeCun's world modeling approach discussion: https://www.linkedin.com/feed/update/urn:li:activity:7165738293223931904/ 7 | 8 | We are aiming to eventually build some sort of a fact base system. Until then, however, we will be passing information directly through to the agent here. 9 | 10 | We're also using this as a way to test how well our new approach to classes (new Client, new Forecast, etc.) will work, so we can plug and play different types of agents here. 11 | 12 | Note that this approach will *not* test new knowledge bases *yet*. 13 | 14 | """ 15 | 16 | from .knowledge import KnowledgeBaseFileCache 17 | from .utils import UtilityHelper 18 | 19 | from . import Client, Statement, Forecast 20 | 21 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 22 | from phasellm.agents import WebpageAgent, WebSearchAgent 23 | 24 | import requests 25 | import dateparser 26 | import re 27 | import datetime 28 | 29 | 30 | class ETClient(object): 31 | 32 | # The base URL for the API, in case we need to change it or if someone wants to self-host anything. 33 | base_url = "https://emergingtrajectories.com/a/api/v0.2/" 34 | 35 | def __init__(self, api_key: str) -> None: 36 | self.api_key = api_key 37 | 38 | def get_statement(self, statement_id: int) -> Statement: 39 | """ 40 | Returns a given statement from the platform. Includes title, description, deadline, and fill-in-the-blank. 41 | 42 | Args: 43 | statement_id: the ID of the statement to retrieve 44 | 45 | Returns: 46 | Statement: the statement from the platform 47 | """ 48 | url = self.base_url + "get_statement" + "/" + str(statement_id) 49 | headers = { 50 | "Authorization": f"Bearer {self.api_key}", 51 | "Content-Type": "application/json", 52 | } 53 | response = requests.post(url, headers=headers) 54 | if response.status_code == 200: 55 | r_obj = response.json() 56 | s = Statement(r_obj["title"], r_obj["fill_in_the_blank"]) 57 | s.id = int(r_obj["id"]) 58 | s.description = r_obj["description"] 59 | s.deadline = dateparser.parse(r_obj["deadline"]) 60 | s.created_at = dateparser.parse(r_obj["created_at"]) 61 | s.updated_at = dateparser.parse(r_obj["updated_at"]) 62 | s.created_by = r_obj["created_by"] 63 | return s 64 | else: 65 | raise Exception(response.text) 66 | 67 | def get_forecast(self, forecast_id: int) -> Forecast: 68 | """ 69 | Returns a given forecast from the platform. 
70 | 71 | Args: 72 | forecast_id: the ID of the forecast to retrieve 73 | 74 | Returns: 75 | Forecast: the forecast from the platform 76 | """ 77 | url = self.base_url + "get_forecast" + "/" + str(forecast_id) 78 | headers = { 79 | "Authorization": f"Bearer {self.api_key}", 80 | "Content-Type": "application/json", 81 | } 82 | response = requests.post(url, headers=headers) 83 | if response.status_code == 200: 84 | 85 | r_obj = response.json() 86 | 87 | f = Forecast(r_obj["title"], float(r_obj["value"]), r_obj["justification"]) 88 | 89 | f.id = int(r_obj["forecast_id"]) 90 | 91 | f.statement_id = int(r_obj["statement_id"]) 92 | f.statement = self.get_statement(int(r_obj["statement_id"])) 93 | 94 | f.created_at = dateparser.parse(r_obj["created_at"]) 95 | f.updated_at = dateparser.parse(r_obj["updated_at"]) 96 | # f.created_by = r_obj["created_by"] 97 | f.prediction_agent = r_obj["prediction_agent"] 98 | 99 | f.additional_data = r_obj["additional_data"] 100 | 101 | if "prior_forecast" in r_obj: 102 | if r_obj["prior_forecast"] is not None: 103 | f.prior_forecast = int(r_obj["prior_forecast"]) 104 | f.is_human = bool(r_obj["is_human"]) 105 | 106 | if "next_forecasts" in r_obj: 107 | if r_obj["next_forecasts"] is not None: 108 | f.next_forecasts = r_obj["next_forecasts"] 109 | 110 | return f 111 | else: 112 | raise Exception(response.text) 113 | 114 | def add_facts_to_factbase( 115 | self, fact_db_slug: str, url: str, facts: list[str] 116 | ) -> bool: 117 | """ 118 | Adds a list of facts to a factbase on the Emerging Trajectories website. 119 | 120 | Args: 121 | fact_db_slug: the slug of the fact database to add the facts to. 122 | url: the URL the facts come from. 123 | facts: the facts to add (a list of strings). 124 | 125 | Returns: 126 | bool: True if successful, False otherwise. 127 | """ 128 | 129 | api_url = self.base_url + "add_facts/" + fact_db_slug 130 | headers = { 131 | "Authorization": f"Bearer {self.api_key}", 132 | "Content-Type": "application/json", 133 | } 134 | j = { 135 | "facts": facts, 136 | "url": url, 137 | } 138 | response = requests.post(api_url, headers=headers, json=j) 139 | 140 | if response.status_code == 200 or response.status_code == 201: 141 | return True 142 | print(response) 143 | return False 144 | 145 | def add_fact_to_factbase(self, fact_db_slug: str, url: str, fact: str) -> bool: 146 | """ 147 | Adds a fact to a factbase on the Emerging Trajectories website. 148 | 149 | Args: 150 | fact_db_slug: the slug of the fact database to add the fact to. 151 | url: the URL of the fact. 152 | fact: the fact to add. 153 | 154 | Returns: 155 | bool: True if successful, False otherwise. 156 | """ 157 | api_url = self.base_url + "add_fact/" + fact_db_slug 158 | headers = { 159 | "Authorization": f"Bearer {self.api_key}", 160 | "Content-Type": "application/json", 161 | } 162 | j = { 163 | "fact": fact, 164 | "url": url, 165 | } 166 | response = requests.post(api_url, headers=headers, json=j) 167 | 168 | if response.status_code == 200 or response.status_code == 201: 169 | return True 170 | print(response) 171 | return False 172 | 173 | def add_content_to_factbase( 174 | self, fact_db_slug: str, url: str, content: str, topic: str 175 | ) -> bool: 176 | """ 177 | Sends content to the Emerging Trajectories website and extracts facts from it. 178 | 179 | Args: 180 | fact_db_slug: the slug of the fact database to add the content to. 181 | url: the URL of the content. Note: we do not actually crawl this, we assume the content passed is the right content. 182 | content: the content to extract facts from.
183 | topic: the topic of the content. 184 | 185 | Returns: 186 | bool: True if successful, False otherwise. 187 | """ 188 | 189 | api_url = self.base_url + "add_content_to_factbase/" + fact_db_slug 190 | headers = { 191 | "Authorization": f"Bearer {self.api_key}", 192 | "Content-Type": "application/json", 193 | } 194 | j = { 195 | "content": content, 196 | "url": url, 197 | "topic": topic, 198 | } 199 | response = requests.post(api_url, headers=headers, json=j) 200 | 201 | if response.status_code == 200 or response.status_code == 201: 202 | return True 203 | print(response) 204 | return False 205 | 206 | 207 | # TODO Move to Utils.py, or elsewhere. 208 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str: 209 | """ 210 | The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the cituations to follow the proper numerical order, and also include the URLs at the very end. 211 | 212 | Args: 213 | assistant_analysis: the analysis text from the assistant 214 | ctr_to_source: the mapping of citation number to source URL 215 | 216 | Returns: 217 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 218 | """ 219 | 220 | new_ctr_map = {} 221 | ctr = 1 222 | 223 | end_notes = "\n\n--- SOURCES ---\n\n" 224 | new_analysis = "" 225 | 226 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 227 | 228 | last_index = 0 229 | for m in matches: 230 | 231 | m_start = m.start() + 1 232 | m_end = m.end() - 1 233 | 234 | old_ctr = int(m.group()[1:-1]) 235 | uri = ctr_to_source[old_ctr] 236 | 237 | if old_ctr not in new_ctr_map: 238 | new_ctr_map[old_ctr] = ctr 239 | end_notes += f"{ctr}: {uri}\n" 240 | ctr += 1 241 | 242 | new_analysis += assistant_analysis[last_index:m_start] + str( 243 | new_ctr_map[old_ctr] 244 | ) 245 | last_index = m_end 246 | 247 | if last_index != 0: 248 | new_analysis += assistant_analysis[last_index:] + end_notes 249 | 250 | else: 251 | new_analysis = assistant_analysis + end_notes + "No citations provided." 252 | 253 | return new_analysis 254 | 255 | 256 | #### 257 | # INITIAL FORECAST 258 | # 259 | 260 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 261 | 262 | The project description is as follows... 263 | {statement_description} 264 | 265 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points. 266 | 267 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank} 268 | """ 269 | 270 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect. 271 | 272 | ---------------------- 273 | {scraped_content} 274 | ---------------------- 275 | 276 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discuss your logic and rationale for making a forecast based on the above. 277 | 278 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 
279 | 
280 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending toward hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
281 | 
282 | Do this for the final justification of your forecast as well.
283 | 
284 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
285 | """
286 | 
287 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
288 | 
289 | {statement_fill_in_the_blank}
290 | """
291 | 
292 | 
293 | class RecursiveForecastingAgent(object):
294 | 
295 |     # TODO: eventually, should move the Google / KnowledgeBaseFileCache to some other knowledge process.
296 |     def __init__(
297 |         self,
298 |         client: ETClient,
299 |         chatbot: ChatBot,
300 |         google_api_key: str,
301 |         google_search_id: str,
302 |         google_search_query: str,
303 |         knowledge_base: KnowledgeBaseFileCache,
304 |     ):
305 | 
306 |         self.google_api_key = google_api_key
307 |         self.google_search_id = google_search_id
308 |         self.google_search_query = google_search_query
309 |         self.knowledge_base = knowledge_base
310 |         self.client = client
311 |         self.chatbot = chatbot
312 | 
313 |     # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity.
314 |     def setChatBot(self, chatbot):
315 |         self.chatbot = chatbot
316 | 
317 |     # TODO: standardize -- camel case or snake case? Or something else?
318 |     def getChatBot(self):
319 |         return self.chatbot
320 | 
321 |     def create_forecast(
322 |         self, statement: Statement, openai_api_key, et_api_key, facts=None
323 |     ):
324 |         """
325 |         Creates an initial forecast for `statement`: gathers new web content into the knowledge base, prompts the LLM for an analysis and a numerical prediction, and submits the result to the Emerging Trajectories platform (returns None if no new content was found). Design notes -- options for taking in x(t) or z...
326 |         1) x(t) and z are strings... An array of facts.
327 |         2) x(t) and z are specific preprogrammed/strict facts, like "today's date" and "last forecast".
328 |         3) Facts are "Fact Objects" that have specific string representations. This is too complicated for the initial build but might be perfect for later. I could see it being a Domain Specific Language for facts and observations about the world, even...
329 |         """
330 | 
331 |         statement_id = statement.id
332 |         statement_title = statement.title
333 |         statement_description = statement.description
334 |         fill_in_the_blank = statement.fill_in_the_blank
335 | 
336 |         knowledge_base = self.knowledge_base
337 | 
338 |         webagent = WebSearchAgent(api_key=self.google_api_key)
339 |         results = webagent.search_google(
340 |             query=self.google_search_query,
341 |             custom_search_engine_id=self.google_search_id,
342 |             num=10,
343 |         )
344 | 
345 |         scraped_content = ""
346 | 
347 |         added_new_content = False
348 | 
349 |         # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt.
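        # ctr numbers each piece of scraped content added below, and ctr_to_source maps that number
        # back to its source URL so that clean_citations() can later renumber the model's [#]
        # references and append the matching source list to the analysis.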
350 | accessed_resources = [] 351 | 352 | ctr = 0 353 | ctr_to_source = {} 354 | 355 | for result in results: 356 | if not knowledge_base.in_cache(result.url): 357 | ctr += 1 358 | added_new_content = True 359 | page_content = knowledge_base.get(result.url) 360 | 361 | accessed_resources.append(result.url) 362 | # knowledge_base.log_access(result.url) 363 | 364 | scraped_content += ( 365 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 366 | ) 367 | ctr_to_source[ctr] = result.url 368 | 369 | # We also check the knowledge base for content that was added manually. 370 | unaccessed_uris = knowledge_base.get_unaccessed_content() 371 | for ua in unaccessed_uris: 372 | added_new_content = True 373 | ctr += 1 374 | page_content = knowledge_base.get(ua) 375 | 376 | accessed_resources.append(ua) 377 | # knowledge_base.log_access(ua) 378 | 379 | scraped_content += ( 380 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 381 | ) 382 | ctr_to_source[ctr] = ua 383 | 384 | if not added_new_content: 385 | print("No new content added to the forecast.") 386 | return None 387 | 388 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 389 | 390 | # llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 391 | # chatbot = ChatBot(llm) 392 | chatbot = self.chatbot 393 | 394 | first_user_message = base_user_prompt 395 | if facts is not None: 396 | fact_str = "" 397 | for f in facts: 398 | fact_str += "-- " + f + "\n" 399 | first_user_message = ( 400 | "We know the following facts. These are fully correct and should be used to inform your forecast:" 401 | + fact_str.strip() 402 | + "\n\n" 403 | + first_user_message 404 | ) 405 | 406 | prompt_template = ChatPrompt( 407 | [ 408 | {"role": "system", "content": base_system_prompt}, 409 | {"role": "user", "content": first_user_message}, 410 | ] 411 | ) 412 | 413 | chatbot.messages = prompt_template.fill( 414 | statement_title=statement_title, 415 | statement_description=statement_description, 416 | statement_fill_in_the_blank=fill_in_the_blank, 417 | scraped_content=scraped_content, 418 | the_date=the_date, 419 | ) 420 | 421 | assistant_analysis = chatbot.resend() 422 | 423 | print("\n\n\n") 424 | print(assistant_analysis) 425 | 426 | prompt_template_2 = ChatPrompt( 427 | [ 428 | {"role": "system", "content": base_system_prompt}, 429 | {"role": "user", "content": first_user_message}, 430 | {"role": "assistant", "content": "{assistant_analysis}"}, 431 | {"role": "user", "content": base_user_prompt_followup}, 432 | ] 433 | ) 434 | 435 | chatbot.messages = prompt_template_2.fill( 436 | statement_title=statement_title, 437 | statement_description=statement_description, 438 | statement_fill_in_the_blank=fill_in_the_blank, 439 | scraped_content=scraped_content, 440 | assistant_analysis=assistant_analysis, 441 | the_date=the_date, 442 | ) 443 | 444 | filled_in_statement = chatbot.resend() 445 | 446 | print("\n\n\n") 447 | print(filled_in_statement) 448 | 449 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 450 | 451 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 452 | print(assistant_analysis_sourced) 453 | 454 | uh = UtilityHelper(openai_api_key) 455 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 456 | 457 | client = Client(et_api_key) 458 | 459 | response = client.create_forecast( 460 | statement_id, 461 | "Prediction", 462 | assistant_analysis_sourced, 463 | prediction, 464 | "Test Agent", 465 | { 466 | "full_response_from_llm_before_source_cleanup": 
assistant_analysis, 467 | "full_response_from_llm": assistant_analysis_sourced, 468 | "raw_forecast": filled_in_statement, 469 | "extracted_value": prediction, 470 | }, 471 | ) 472 | 473 | for ar in accessed_resources: 474 | knowledge_base.log_access(ar) 475 | 476 | return response 477 | 478 | def extend_forecast(self, forecast: Forecast): 479 | pass 480 | -------------------------------------------------------------------------------- /emergingtrajectories/utils.py: -------------------------------------------------------------------------------- 1 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 2 | 3 | # Prompt used for extracting predictions from text messages. 4 | _extract_prediction_prompt = """You are helping a researcher with a data extraction exercise. You will be provided with a prediction statement and a broader piece of text. Your objective is to extract the specific numerical prediction and provide it as a response. DO NOT qualify your response in any way. 5 | 6 | For example, suppose you have the following... 7 | 8 | |||---||| 9 | PREDICTION STATEMENT: please predict the probability that the price of Bitcoin will exceed $100,000 by the end of 2024. 10 | 11 | TEXT: The probability that bitcoin will exceed $100,000 by the end of 2024 is 0.37. 12 | |||---||| 13 | 14 | In the case above, your response would simply be "0.37". 15 | 16 | The actual metrics (i.e., prediction) might be provided with formatting. For example... 17 | 18 | |||---||| 19 | PREDICTION STATEMENT: The probability that Boeing's (NYSE:BA) share price at the close of markets on or before March 1, 2024 will be $220.00 USD or higher is _____ (value between 0.00 and 1.00). 20 | 21 | TEXT: The probability that Boeing's (NYSE:BA) share price at the close of markets on or before March 1, 2024, will be $220.00 USD or higher is **0.65**. 22 | |||---||| 23 | 24 | In this case, ignore the asterisks or instructions ("value between 0.00 and 1.00") and provide the correct response, which is 0.65. 25 | 26 | The user will provide you with a PREDICTION STATEMENT and TEXT and you need to answer like the above. 27 | 28 | On the extremely rare occasion that the TEXT does not have a proper numerical prediction or you are unable to extract it, simply respond with "UNCLEAR". 29 | """ 30 | 31 | # Error message used when the prediction cannot be extracted from the response. 32 | _extract_prediction_prompt_error = "UNCLEAR" 33 | 34 | 35 | def is_numeric(string: str) -> bool: 36 | """ 37 | Checks whether the 'string' passed as an argument can be converted into a numeric value. 38 | 39 | Args: 40 | string: the string in question 41 | 42 | Returns: 43 | Boolean value; True if the string can be converted into a numeric value, False otherwise. 
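
    Example:
        is_numeric("0.37") returns True, while is_numeric("$85") returns False -- callers such as extract_prediction() strip currency symbols and commas before calling this.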
44 | """ 45 | if string is None: 46 | return False 47 | try: 48 | float(string) 49 | return True 50 | except ValueError: 51 | return False 52 | 53 | 54 | # TODO document 55 | def run_forecast(function_to_call, n, *args, **kwargs): 56 | 57 | if n == 0: 58 | return None 59 | 60 | result = None 61 | 62 | try: 63 | result = function_to_call(*args, **kwargs) 64 | except Exception as e: 65 | print(f"Forecast failed with error: {e}") 66 | print(f"Trying up to {n-1} more times.") 67 | result = run_forecast(function_to_call, n - 1, *args, **kwargs) 68 | 69 | if result is None: 70 | print(f"Forecast failed after {n} attempts.") 71 | 72 | return result 73 | 74 | 75 | class UtilityHelper(object): 76 | 77 | def __init__(self, api_key, model="gpt-4-0125-preview") -> None: 78 | """ 79 | The UtilityHelper class is used to extract predictions from text messages. 80 | 81 | Args: 82 | api_key: the OpenAI API key 83 | model: the OpenAI model to use for the extraction process 84 | """ 85 | 86 | self.api_key = api_key 87 | self.model = model 88 | 89 | def extract_prediction(self, response: str, statement_challenge: str) -> float: 90 | """ 91 | Extracts the prediction value from the response to a statement challenge. 92 | 93 | Args: 94 | response: the response to the statement challenge (i.e., what was predicted by another LLM) 95 | statement_challenge: the statement challenge -- what is being predicted 96 | 97 | Returns: 98 | The extracted prediction value as a float. Raises an exception if the prediction cannot be extracted. 99 | """ 100 | 101 | message_stack = [ 102 | {"role": "system", "content": _extract_prediction_prompt}, 103 | { 104 | "role": "user", 105 | "content": f"PREDICTION STATEMENT: {statement_challenge}\n\nTEXT: {response}", 106 | }, 107 | ] 108 | 109 | # print(f"PREDICTION STATEMENT: {statement_challenge}\n\nTEXT: {response}") 110 | 111 | llm = OpenAIGPTWrapper(apikey=self.api_key, model=self.model) 112 | chatbot = ChatBot(llm) 113 | chatbot.messages = message_stack 114 | 115 | output = chatbot.resend() 116 | 117 | # print(f"\n\n\n{output}\b\b\b") 118 | 119 | if output == _extract_prediction_prompt_error: 120 | raise Exception("Unable to extract prediction from response.") 121 | 122 | if output[0] == "$": 123 | output = output[1:] 124 | 125 | # Remove commas... 126 | output = output.replace(",", "") 127 | 128 | if not is_numeric(output): 129 | raise Exception(f"Prediction does not appear to be numeric:\n{output}") 130 | 131 | return float(output) 132 | -------------------------------------------------------------------------------- /forecast1.py: -------------------------------------------------------------------------------- 1 | # This is a sample model for tracking oil prices. 
2 | 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | load_dotenv() 7 | openai_api_key = os.getenv("OPENAI_API_KEY") 8 | et_api_key = os.getenv("ET_API_KEY") 9 | google_api_key = os.getenv("GOOGLE_API_KEY") 10 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 11 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 12 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 13 | news_api_key = os.getenv("NEWS_API_KEY") 14 | 15 | from emergingtrajectories.news import RSSAgent 16 | from emergingtrajectories.crawlers import crawlerPlaywright 17 | from emergingtrajectories.factsrag import ( 18 | FactRAGFileCache, 19 | FactBot, 20 | clean_fact_citations, 21 | ) 22 | from emergingtrajectories.factsragforecaster import FactsRAGForecastingAgent 23 | from emergingtrajectories import Client 24 | from emergingtrajectories.recursiveagent import ETClient 25 | 26 | from phasellm.llms import ( 27 | OpenAIGPTWrapper, 28 | ClaudeWrapper, 29 | VertexAIWrapper, 30 | ChatBot, 31 | ReplicateLlama2Wrapper, 32 | ) 33 | 34 | topic = "oil futures and oil prices for 2024" 35 | 36 | queries = ["oil prices end of 2024 and early 2025", "oil prices today"] 37 | 38 | crawler = crawlerPlaywright(True) 39 | fr = FactRAGFileCache("forecasting_oil", openai_api_key, crawler=crawler) 40 | 41 | """ 42 | # Testing cleaning citations... 43 | 44 | str_out = clean_fact_citations( 45 | fr, "Hey, this is a set of facts [f1]. This is also a set of facts [f2, f3]." 46 | ) 47 | print(str_out) 48 | """ 49 | 50 | """ 51 | # Get Content 52 | 53 | fr.new_get_rss_links( 54 | "https://www.oilholicssynonymous.com/feeds/posts/default", topic=topic 55 | ) 56 | # TODO Need to fix timeout bug. 57 | # fr.new_get_rss_links("https://oilprice.com/rss/main", topic=topic) 58 | 59 | fr.new_get_new_info_google(google_api_key, google_search_id, queries, topic=topic) 60 | """ 61 | 62 | """ 63 | # Create a forecast 64 | 65 | # client = Client(et_api_key) 66 | # s = client.get_statement(5) 67 | etc = ETClient(et_api_key) 68 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 69 | chatbot = ChatBot(llm) 70 | f = FactsRAGForecastingAgent(ETClient(et_api_key), chatbot, fr) 71 | f.create_forecast( 72 | etc.get_statement(5), 73 | openai_api_key, 74 | et_api_key, 75 | ["Today's oil price is about $85."], 76 | prediction_agent="FactsRAGForecastingAgent", 77 | ) 78 | """ 79 | 80 | """ 81 | # Extend a forecast 82 | etc = ETClient(et_api_key) 83 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 84 | chatbot = ChatBot(llm) 85 | f = FactsRAGForecastingAgent(ETClient(et_api_key), chatbot, fr) 86 | f.extend_forecast( 87 | etc.get_forecast(252), 88 | openai_api_key, 89 | et_api_key, 90 | prediction_agent="FactsRAGForecastingAgent", 91 | ) 92 | """ 93 | -------------------------------------------------------------------------------- /forecasttest1.py: -------------------------------------------------------------------------------- 1 | # See forecast1.py instead!! 
2 | 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | from google.cloud import aiplatform 7 | 8 | aiplatform.init(project="phasellm-gemini-testing") 9 | 10 | load_dotenv() 11 | openai_api_key = os.getenv("OPENAI_API_KEY") 12 | et_api_key = os.getenv("ET_API_KEY") 13 | google_api_key = os.getenv("GOOGLE_API_KEY") 14 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 15 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 16 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 17 | news_api_key = os.getenv("NEWS_API_KEY") 18 | 19 | from emergingtrajectories import Client 20 | from emergingtrajectories.news import NewsAPIAgent 21 | from emergingtrajectories.crawlers import crawlerPlaywright 22 | from emergingtrajectories.factsrag import FactRAGFileCache 23 | from emergingtrajectories.recursiveagent import ETClient 24 | from emergingtrajectories.factsragforecaster import FactsRAGForecastingAgent 25 | 26 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 27 | 28 | topic = "drivers of oil prices, associated political/economic/military climate, and oil futures commodity prices" 29 | 30 | crawler = crawlerPlaywright(False) 31 | fr = FactRAGFileCache("testing_oil", openai_api_key, crawler=crawler) 32 | 33 | etc = ETClient(et_api_key) 34 | statement = etc.get_statement(5) 35 | 36 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 37 | chatbot = ChatBot(llm) 38 | 39 | google_search_queries = [ 40 | "oil price projections", 41 | "how do oil prices work", 42 | "oil and geopolitics 2024", 43 | ] 44 | 45 | # fr.new_get_new_info_google( 46 | # google_api_key, 47 | # google_search_id, 48 | # google_search_queries, 49 | # topic, 50 | # ) 51 | 52 | forecaster = FactsRAGForecastingAgent(etc, chatbot, fr) 53 | 54 | """ 55 | result = forecaster.create_forecast( 56 | etc.get_statement(5), 57 | openai_api_key, 58 | et_api_key, 59 | ) 60 | print(result) 61 | """ 62 | 63 | """ 64 | client = Client(et_api_key) 65 | forecast_id = client.get_most_recent_forecast(5) 66 | result = forecaster.extend_forecast( 67 | etc.get_forecast(forecast_id), 68 | openai_api_key, 69 | et_api_key, 70 | ) 71 | print(result) 72 | """ 73 | -------------------------------------------------------------------------------- /newstest1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | 20 | """ 21 | na = NewsAPIAgent(news_api_key) 22 | 23 | r = na.get_news("lng") 24 | r = na.get_news("covid") 25 | 26 | for result in r['articles']: 27 | print("") 28 | print(result['title']) 29 | print(result['url']) 30 | print(result['publishedAt']) 31 | print("\n\n") 32 | """ 33 | 34 | """from emergingtrajectories.factsrag import FactRAGFileCache 35 | 36 | topic = "Liquefied Natural Gas (LNG) futures + commodity prices" 37 | 38 | crawler = crawlerPlaywright(False) 39 | fr = FactRAGFileCache("test_rag", openai_api_key, crawler=crawler) 40 | # 
fr.facts_from_url("https://www.bbc.com/news/business-63585732", topic=topic) 41 | 42 | print(fr.query_to_fact_content("What is LNG?")) 43 | """ 44 | 45 | na = NewsAPIAgent(news_api_key) 46 | 47 | r = na.get_news_as_list("autonomous trucking") 48 | 49 | for result in r["articles"]: 50 | print("") 51 | print(result["title"]) 52 | print(result["url"]) 53 | print(result["publishedAt"]) 54 | print("\n\n") 55 | -------------------------------------------------------------------------------- /newstest2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, FactBot 20 | 21 | topic = "Liquefied Natural Gas (LNG) futures + commodity prices" 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("test_rag", openai_api_key, crawler=crawler) 24 | bot = FactBot(fr, openai_api_key) 25 | -------------------------------------------------------------------------------- /newstest3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache 20 | from emergingtrajectories.recursiveagent import ETClient 21 | 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 25 | 26 | crawler = crawlerPlaywright(False) 27 | fr = FactRAGFileCache("rag_lng", openai_api_key, crawler=crawler) 28 | 29 | etc = ETClient(et_api_key) 30 | statement = etc.get_statement(37) 31 | 32 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 33 | chatbot = ChatBot(llm) 34 | 35 | google_search_queries = [ 36 | "LNG futures prices and estimmates", 37 | "LNG prices in 2024 and 2025", 38 | "Political, economic, and military drivers of LNG prices", 39 | "Biggest producers of LNG", 40 | "Biggest consumers of LNG", 41 | "LNG and Natural Gas price and structural relationships", 42 | "The politics of LNG", 43 | "LNG regulations in USA", 44 | "LNG regulations in China", 45 | "LNG regulations in Europe", 46 | "LNG and the Russia/Ukraine war", 47 | "The economics of LNG production", 48 | "Economic models and LNG", 49 | ] 50 
| 51 | fr.summarize_new_info_multiple_queries( 52 | statement, 53 | chatbot, 54 | google_api_key, 55 | google_search_id, 56 | google_search_queries, 57 | topic, 58 | ) 59 | -------------------------------------------------------------------------------- /newstest4.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, FactBot 20 | 21 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("rag_lng", openai_api_key, crawler=crawler) 24 | bot = FactBot(fr, openai_api_key) 25 | -------------------------------------------------------------------------------- /newstest5.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, clean_fact_citations 20 | from emergingtrajectories.recursiveagent import ETClient 21 | 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | from datetime import datetime 25 | 26 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 27 | 28 | crawler = crawlerPlaywright(False) 29 | fr = FactRAGFileCache("testing_news_lng", openai_api_key, crawler=crawler) 30 | 31 | etc = ETClient(et_api_key) 32 | statement = etc.get_statement(37) 33 | 34 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 35 | chatbot = ChatBot(llm) 36 | 37 | google_search_queries = [ 38 | "Economic models and LNG", 39 | ] 40 | 41 | """fr.new_get_new_info_google( 42 | google_api_key, 43 | google_search_id, 44 | google_search_queries, 45 | topic, 46 | )""" 47 | 48 | """fr.new_get_new_info_news( 49 | news_api_key, topic, ["lng", "liquefied natural gas"], top_headlines=False 50 | )""" 51 | 52 | """print( 53 | fr.query_to_fact_content( 54 | "What is LNG?", n_results=3, since_date=datetime(2023, 1, 1) 55 | ) 56 | )""" 57 | 58 | # text = "abcd [f1], [f3, f6] and so on [f4]" 59 | # print(clean_fact_citations(fr, text)) 60 | 
-------------------------------------------------------------------------------- /newstest6.py: -------------------------------------------------------------------------------- 1 | # Testing RSS feeds 2 | # https://www.oilholicssynonymous.com/feeds/posts/default 3 | 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | openai_api_key = os.getenv("OPENAI_API_KEY") 9 | et_api_key = os.getenv("ET_API_KEY") 10 | google_api_key = os.getenv("GOOGLE_API_KEY") 11 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 12 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 13 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 14 | news_api_key = os.getenv("NEWS_API_KEY") 15 | 16 | from emergingtrajectories.news import RSSAgent 17 | from emergingtrajectories.crawlers import crawlerPlaywright 18 | from emergingtrajectories.factsrag import FactRAGFileCache 19 | 20 | topic = "oil futures and oil prices" 21 | 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("test_rss_oil", openai_api_key, crawler=crawler) 24 | 25 | fr.new_get_rss_links( 26 | "https://www.oilholicssynonymous.com/feeds/posts/default", topic=topic 27 | ) 28 | 29 | print(fr.query_to_fact_content("How are oil prices doing in March 2024?")) 30 | -------------------------------------------------------------------------------- /newstest7.py: -------------------------------------------------------------------------------- 1 | topic = ( 2 | "Any news related to finance, economics, government, diplomacy, or current affairs" 3 | ) 4 | 5 | import os 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | ft_user = os.getenv("FT_USER_NAME") 18 | ft_pass = os.getenv("FT_PASSWORD") 19 | 20 | from emergingtrajectories.news import FinancialTimesAgent 21 | from emergingtrajectories.factsrag import FactRAGFileCache 22 | 23 | # fta = FinancialTimesAgent(ft_user, ft_pass) 24 | # a = fta.get_news() 25 | 26 | f = FactRAGFileCache("rag_demo_ft_rss", openai_api_key) 27 | f.get_ft_news(ft_user, ft_pass, topic) 28 | -------------------------------------------------------------------------------- /project_metadata.py: -------------------------------------------------------------------------------- 1 | NAME = "emergingtrajectories" 2 | 3 | AUTHOR = "Wojciech Gryc" 4 | 5 | VERSION = "0.2.53" 6 | 7 | DESCRIPTION = "Open source library for tracking and saving forecasts of political, economic, and social events." 8 | 9 | LONG_DESCRIPTION = "Open source library for tracking and saving forecasts of political, economic, and social events." 
10 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | twine>=4.0.2 2 | wheel>=0.41.3 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | phasellm>=0.0.22 2 | Django>=5.0.0 3 | python-dotenv>=1.0.0 4 | dateparser>=1.2.0 5 | pytest-playwright 6 | beautifulsoup4 7 | chromadb 8 | feedparser 9 | pypdf 10 | faiss-cpu 11 | microsoft-bing-newssearch 12 | scrapingbee 13 | tiktoken 14 | # db-dtypes # For Google Cloud, unsure if needed -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from project_metadata import NAME, VERSION, AUTHOR, DESCRIPTION, LONG_DESCRIPTION 4 | 5 | setup( 6 | name=NAME, 7 | version=VERSION, 8 | description=DESCRIPTION, 9 | long_description=LONG_DESCRIPTION, 10 | author=AUTHOR, 11 | author_email="hello@phaseai.com", 12 | license="MIT", 13 | packages=find_packages(), 14 | install_requires=[ 15 | "phasellm>=0.0.22", 16 | "Django>=5.0.0", 17 | "python-dotenv>=1.0.0", 18 | "dateparser>=1.2.0", 19 | "pytest-playwright", 20 | "beautifulsoup4", 21 | "chromadb", 22 | "feedparser", 23 | "pypdf", 24 | "faiss-cpu", 25 | "microsoft-bing-newssearch", 26 | "scrapingbee", 27 | "tiktoken", 28 | ], 29 | extras_require={ 30 | "docs": [ 31 | "furo", 32 | "sphinx>=7.1.2", 33 | "myst_parser>=2.0.0", 34 | "sphinx-autoapi>=2.1.1", 35 | "sphinx-autobuild>=2021.3.14", 36 | ] 37 | }, 38 | python_requires=">=3.10.0", 39 | keywords="llm, nlp, ai, social, politics, economics", 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "License :: OSI Approved :: MIT License", 44 | "Programming Language :: Python :: 3", 45 | ], 46 | ) 47 | --------------------------------------------------------------------------------