├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── README.md ├── make.bat └── source │ ├── conf.py │ └── index.md ├── emergingtrajectories ├── __init__.py ├── agents.py ├── chunkers.py ├── citationagents.py ├── crawlers.py ├── facts.py ├── factsforecaster.py ├── factsrag.py ├── factsrag2.py ├── factsrag3.py ├── factsragforecaster.py ├── factsragforecaster2.py ├── factsragforecaster3.py ├── knowledge.py ├── news.py ├── pdf.py ├── prompts.py ├── recursiveagent.py └── utils.py ├── forecast1.py ├── forecasttest1.py ├── newstest1.py ├── newstest2.py ├── newstest3.py ├── newstest4.py ├── newstest5.py ├── newstest6.py ├── newstest7.py ├── project_metadata.py ├── requirements-dev.txt ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | python: 12 | install: 13 | - method: pip 14 | path: . 15 | extra_requirements: 16 | - docs -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Phase AI Technologies Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Emerging Trajectories 2 | 3 | Open source library for tracking and saving forecasts of political, economic, and social events. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install emergingtrajectories 9 | ``` 10 | 11 | ## Questions? 
12 | 13 | Visit our site: https://emergingtrajectories.com/ 14 | 15 | Please reach out: hello --at-- emergingtrajectories --dot-- com 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ### Docs Setup 2 | 3 | 1) Install docs dependencies 4 | ``` 5 | pip install -e .[docs] 6 | ``` 7 | 8 | 2) Run a local docs server 9 | ``` 10 | sphinx-autobuild docs/source/ docs/build/html 11 | ``` 12 | 13 | ### Manual Build 14 | 15 | ``` 16 | cd docs 17 | make html 18 | ``` 19 | 20 | ### Helpful Tools 21 | 22 | * Convert reStructuredText (.rst) to Markdown (.md) 23 | ``` 24 | pip install rst-to-myst[sphinx] 25 | rst2myst convert docs/**/*.rst 26 | ``` 27 | 28 | ### Useful Resources 29 | 30 | * Document Your Scientific Project With Markdown, Sphinx, and Read the Docs | PyData Global 2021 31 | * https://www.sphinx-doc.org/en/master/usage/quickstart.html 32 | * https://www.youtube.com/watch?v=qRSb299awB0 -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../../") 4 | 5 | from project_metadata import NAME, VERSION, AUTHOR # noqa: E402 6 | 7 | # Configuration file for the Sphinx documentation builder. 
8 | # 9 | # For the full list of built-in configuration values, see the documentation: 10 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 11 | 12 | # -- Project information ----------------------------------------------------- 13 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 14 | 15 | project = NAME 16 | copyright = f"2024, {AUTHOR}" 17 | author = AUTHOR 18 | release = VERSION 19 | 20 | # -- General configuration --------------------------------------------------- 21 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 22 | 23 | # Add paths to the Python source code. 24 | sys.path.append("../../emergingtrajectories") 25 | 26 | # Allow markdown files to be used. 27 | extensions = [ 28 | "myst_parser", 29 | "autoapi.extension", 30 | "sphinx.ext.duration", 31 | "sphinx.ext.autodoc", 32 | "sphinx.ext.napoleon", 33 | ] 34 | 35 | # Configure autoapi. 36 | autoapi_dirs = ["../../emergingtrajectories"] 37 | autoapi_python_class_content = "init" 38 | 39 | templates_path = ["_templates"] 40 | exclude_patterns = [] 41 | 42 | # -- Options for HTML output ------------------------------------------------- 43 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 44 | 45 | html_theme = "furo" 46 | html_static_path = ["_static"] 47 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | % Phasellm documentation master file, created by 2 | % sphinx-quickstart on Tue Aug 8 15:42:56 2023. 3 | % You can adapt this file completely to your liking, but it should at least 4 | % contain the root `toctree` directive. 5 | 6 | ```{include} ../../README.md 7 | :relative-images: 8 | ``` 9 | 10 | ## Contents 11 | ```{toctree} 12 | :maxdepth: 2 13 | 14 | ``` 15 | -------------------------------------------------------------------------------- /emergingtrajectories/agents.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents for generating forecasts. 3 | """ 4 | 5 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 6 | from phasellm.agents import WebpageAgent, WebSearchAgent 7 | 8 | from . import Client 9 | from .utils import UtilityHelper 10 | from .knowledge import KnowledgeBaseFileCache 11 | 12 | # from . import scrapeandpredict as sap 13 | 14 | import datetime 15 | 16 | # Step 0: provide context 17 | # Step 1: provide content and extract facts 18 | # Step 2: review past forecast and determine if new information changes the forecast 19 | # Step 3: update the actual forecast statement 20 | 21 | base_system_prompt_ext = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 22 | 23 | The project description is as follows... 24 | {statement_description} 25 | 26 | We need your help analyzing content and extracting any relevant information. We'll have a few requests for you... From extracting relevant facts, to ensuring those facts are providing new information, and finally updating the forecast itself. 27 | 28 | The user will provide the relevant requests. 29 | """ 30 | 31 | ext_message_1 = """Today's date is {the_date}. 32 | 33 | Here is all the content we've managed to collect. 
34 | 
35 | ----------------------
36 | {scraped_content}
37 | ----------------------
38 | 
39 | Could you please extract the relevant facts from the content provided? Please simply respond by providing a list of facts in bullet point form, like so...
40 | 
41 | - Fact 1
42 | - Fact 2
43 | ... and so on.
44 | """
45 | 
46 | ext_message_2 = """Today's date is {the_date}.
47 | 
48 | Assume all the content and facts above are accurate and correct up to today's date. The forecasting challenge we are working on is outlined below:
49 | {statement_fill_in_the_blank}
50 | 
51 | The earlier forecast was as follows...
52 | ----------------------
53 | PREDICTION: {forecast_value}
54 | 
55 | JUSTIFICATION...
56 | 
57 | {forecast_justification}
58 | ----------------------
59 | 
60 | Given the above, please use your logical thinking and reasoning to update the "justification" by including any new facts you provided earlier. Update the actual forecast prediction accordingly.
61 | 
62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
63 | """
64 | 
65 | ext_message_3 = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank below... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
66 | 
67 | {statement_fill_in_the_blank}
68 | """
69 | 
70 | 
71 | def ExtendScrapePredictAgent(
72 |     openai_api_key: str,
73 |     google_api_key: str,
74 |     google_search_id: str,
75 |     google_search_query: str,
76 |     knowledge_base: KnowledgeBaseFileCache,
77 |     forecast_id: int,
78 |     et_api_key: str = None,
79 |     statement_title: str = None,
80 |     statement_description: str = None,
81 |     fill_in_the_blank: str = None,
82 |     chat_prompt_system: str = base_system_prompt_ext,
83 |     ext_message_1: str = ext_message_1,
84 |     ext_message_2: str = ext_message_2,
85 |     ext_message_3: str = ext_message_3,
86 |     prediction_title: str = "Prediction",
87 |     prediction_agent: str = "Generic Agent",
88 | ) -> dict:
89 |     """
90 |     Extends an existing forecast by scraping new web content and including any content from the knowledge base that has not yet been accessed (assuming there is new content).
91 | 92 | Args: 93 | openai_api_key: the OpenAI API key 94 | google_api_key: the Google Search API key 95 | google_search_id: the Google search ID 96 | google_search_query: the Google search query 97 | knowledge_base: the KnowledgeBaseFileCache object 98 | forecast_id: the ID of the forecast to extend 99 | et_api_key: the Emerging Trajectories API key 100 | statement_title: the title of the statement (if not submitting a statement ID) 101 | statement_description: the description of the statement (if not submitting a statement ID) 102 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 103 | ext_message_1: the first message to the LLM 104 | ext_message_2: the second message to the LLM 105 | ext_message_3: the third message to the LLM 106 | prediction_title: the title of the forecast 107 | prediction_agent: the agent making the forecast 108 | 109 | Returns: 110 | dict: the response from the Emerging Trajectories platform 111 | """ 112 | 113 | if et_api_key is not None: 114 | client = Client(et_api_key) 115 | forecast = client.get_forecast(forecast_id) 116 | statement_id = forecast["statement_id"] 117 | statement = client.get_statement(statement_id) 118 | statement_title = statement["title"] 119 | statement_description = statement["description"] 120 | fill_in_the_blank = statement["fill_in_the_blank"] 121 | justification = forecast["justification"] 122 | forecast_value = forecast["value"] 123 | 124 | webagent = WebSearchAgent(api_key=google_api_key) 125 | results = webagent.search_google( 126 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 127 | ) 128 | 129 | scraped_content = "" 130 | 131 | added_new_content = False 132 | 133 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 134 | accessed_resources = [] 135 | 136 | for result in results: 137 | if not knowledge_base.in_cache(result.url): 138 | added_new_content = True 139 | page_content = knowledge_base.get(result.url) 140 | 141 | accessed_resources.append(result.url) 142 | # knowledge_base.log_access(result.url) 143 | 144 | scraped_content += f"{page_content}\n\n----------------------\n\n" 145 | 146 | # We also check the knowledge base for content that was added manually. 
147 | unaccessed_uris = knowledge_base.get_unaccessed_content() 148 | for ua in unaccessed_uris: 149 | added_new_content = True 150 | page_content = knowledge_base.get(ua) 151 | 152 | # knowledge_base.log_access(ua) 153 | accessed_resources.append(ua) 154 | 155 | scraped_content += f"{page_content}\n\n----------------------\n\n" 156 | 157 | if not added_new_content: 158 | print("No new content added to the forecast.") 159 | return None 160 | 161 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 162 | 163 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 164 | chatbot = ChatBot(llm) 165 | 166 | # Steps 0 and 1 167 | 168 | prompt_template = ChatPrompt( 169 | [ 170 | {"role": "system", "content": chat_prompt_system}, 171 | {"role": "user", "content": ext_message_1}, 172 | ] 173 | ) 174 | 175 | chatbot.messages = prompt_template.fill( 176 | statement_title=statement_title, 177 | statement_description=statement_description, 178 | statement_fill_in_the_blank=fill_in_the_blank, 179 | scraped_content=scraped_content, 180 | the_date=the_date, 181 | forecast_value=str(forecast_value), 182 | forecast_justification=justification, 183 | ) 184 | 185 | new_facts = chatbot.resend() 186 | 187 | print("\n\n\n") 188 | print(new_facts) 189 | 190 | # Step 3 191 | 192 | prompt_template_2 = ChatPrompt( 193 | [ 194 | {"role": "system", "content": chat_prompt_system}, 195 | {"role": "user", "content": ext_message_1}, 196 | {"role": "assistant", "content": "{new_facts}"}, 197 | {"role": "user", "content": ext_message_2}, 198 | ] 199 | ) 200 | 201 | chatbot.messages = prompt_template_2.fill( 202 | statement_title=statement_title, 203 | statement_description=statement_description, 204 | statement_fill_in_the_blank=fill_in_the_blank, 205 | scraped_content=scraped_content, 206 | new_facts=new_facts, 207 | the_date=the_date, 208 | forecast_value=str(forecast_value), 209 | forecast_justification=justification, 210 | ) 211 | 212 | assistant_analysis = chatbot.resend() 213 | 214 | print("\n\n\n") 215 | print(assistant_analysis) 216 | 217 | # Step 4 218 | 219 | prompt_template_3 = ChatPrompt( 220 | [ 221 | {"role": "system", "content": chat_prompt_system}, 222 | {"role": "user", "content": ext_message_1}, 223 | {"role": "assistant", "content": "{new_facts}"}, 224 | {"role": "user", "content": ext_message_2}, 225 | {"role": "assistant", "content": "{assistant_analysis}"}, 226 | {"role": "user", "content": ext_message_3}, 227 | ] 228 | ) 229 | 230 | chatbot.messages = prompt_template_3.fill( 231 | statement_title=statement_title, 232 | statement_description=statement_description, 233 | statement_fill_in_the_blank=fill_in_the_blank, 234 | scraped_content=scraped_content, 235 | new_facts=new_facts, 236 | assistant_analysis=assistant_analysis, 237 | the_date=the_date, 238 | forecast_value=str(forecast_value), 239 | forecast_justification=justification, 240 | ) 241 | 242 | filled_in_statement = chatbot.resend() 243 | 244 | print("\n\n\n") 245 | print(filled_in_statement) 246 | 247 | uh = UtilityHelper(openai_api_key) 248 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 249 | 250 | response = client.create_forecast( 251 | statement_id, 252 | prediction_title, 253 | assistant_analysis, 254 | prediction, 255 | prediction_agent, 256 | { 257 | "full_response_from_llm": assistant_analysis, 258 | "raw_forecast": filled_in_statement, 259 | "extracted_value": prediction, 260 | }, 261 | forecast_id, 262 | ) 263 | 264 | for ar in accessed_resources: 265 | 
        knowledge_base.log_access(ar)
266 | 
267 |     return response
268 | 
269 | 
270 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}.
271 | 
272 | The project description is as follows...
273 | {statement_description}
274 | 
275 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points.
276 | 
277 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank}
278 | """
279 | 
280 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect.
281 | 
282 | ----------------------
283 | {scraped_content}
284 | ----------------------
285 | 
286 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discussing your logic and rationale for making a forecast based on the above.
287 | 
288 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
289 | """
290 | 
291 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
292 | 
293 | {statement_fill_in_the_blank}
294 | """
295 | 
296 | 
297 | # In this case, we also get any documents that haven't been accessed by the agent.
298 | # This is why agent <-> kb needs to be a 1:1 relationship.
299 | def ScrapeAndPredictAgent(
300 |     openai_api_key: str,
301 |     google_api_key: str,
302 |     google_search_id: str,
303 |     google_search_query: str,
304 |     knowledge_base: KnowledgeBaseFileCache = None,
305 |     statement_id: int = -1,
306 |     et_api_key: str = None,
307 |     statement_title: str = None,
308 |     statement_description: str = None,
309 |     fill_in_the_blank: str = None,
310 |     chat_prompt_system: str = base_system_prompt,
311 |     chat_prompt_user: str = base_user_prompt,
312 |     chat_prompt_user_followup: str = base_user_prompt_followup,
313 |     prediction_title: str = "Prediction",
314 |     prediction_agent: str = "Generic Agent",
315 | ) -> dict:
316 |     """
317 |     This agent submits a search query to Google to find information related to its forecast. It also uses any information that it has not previously accessed in its KnowledgeBase. It then generates a forecast with all the relevant information.
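
    Example (an illustrative sketch only; the API keys, search engine ID, statement ID, and cache folder below are placeholders, and the KnowledgeBaseFileCache constructor arguments are assumed; see knowledge.py for its actual signature):

        from emergingtrajectories.knowledge import KnowledgeBaseFileCache
        from emergingtrajectories.agents import ScrapeAndPredictAgent

        kb = KnowledgeBaseFileCache("kb_cache")  # assumed: a local folder used as the cache

        response = ScrapeAndPredictAgent(
            openai_api_key="sk-...",
            google_api_key="...",
            google_search_id="...",
            google_search_query="global inflation outlook",
            knowledge_base=kb,
            statement_id=42,  # hypothetical statement ID on the Emerging Trajectories platform
            et_api_key="et-...",
            prediction_agent="Example Agent",
        )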
318 | 319 | Args: 320 | openai_api_key: the OpenAI API key 321 | google_api_key: the Google Search API key 322 | google_search_id: the Google search ID 323 | google_search_query: the Google search query 324 | knowledge_base: the KnowledgeBaseFileCache object 325 | statement_id: the ID of the statement to use 326 | et_api_key: the Emerging Trajectories API key 327 | statement_title: the title of the statement (if not submitting a statement ID) 328 | statement_description: the description of the statement (if not submitting a statement ID) 329 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 330 | chat_prompt_system: the system prompt for the chatbot (optional, for overriding defaults) 331 | chat_prompt_user: the user prompt for the chatbot (optional, for overriding defaults) 332 | chat_prompt_user_followup: the follow-up user prompt for the chatbot (optional, for overriding defaults) 333 | prediction_title: the title of the forecast 334 | prediction_agent: the agent making the forecast 335 | 336 | Returns: 337 | dict: the response from the Emerging Trajectories platform 338 | """ 339 | 340 | if et_api_key is not None: 341 | client = Client(et_api_key) 342 | statement = client.get_statement(statement_id) 343 | statement_title = statement["title"] 344 | statement_description = statement["description"] 345 | fill_in_the_blank = statement["fill_in_the_blank"] 346 | 347 | if statement_id == -1 and ( 348 | statement_title is None 349 | or statement_description is None 350 | or fill_in_the_blank is None 351 | ): 352 | raise Exception( 353 | "You must provide either a statement ID or a statement title, description, and fill-in-the-blank." 354 | ) 355 | 356 | webagent = WebSearchAgent(api_key=google_api_key) 357 | results = webagent.search_google( 358 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 359 | ) 360 | 361 | scraped_content = "" 362 | 363 | added_new_content = False 364 | 365 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 366 | accessed_resources = [] 367 | 368 | for result in results: 369 | if not knowledge_base.in_cache(result.url): 370 | added_new_content = True 371 | page_content = knowledge_base.get(result.url) 372 | 373 | accessed_resources.append(result.url) 374 | # knowledge_base.log_access(result.url) 375 | 376 | scraped_content += f"{page_content}\n\n----------------------\n\n" 377 | 378 | # We also check the knowledge base for content that was added manually. 
379 | unaccessed_uris = knowledge_base.get_unaccessed_content() 380 | for ua in unaccessed_uris: 381 | added_new_content = True 382 | page_content = knowledge_base.get(ua) 383 | 384 | accessed_resources.append(ua) 385 | # knowledge_base.log_access(ua) 386 | 387 | scraped_content += f"{page_content}\n\n----------------------\n\n" 388 | 389 | if not added_new_content: 390 | print("No new content added to the forecast.") 391 | return None 392 | 393 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 394 | 395 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 396 | chatbot = ChatBot(llm) 397 | 398 | prompt_template = ChatPrompt( 399 | [ 400 | {"role": "system", "content": chat_prompt_system}, 401 | {"role": "user", "content": chat_prompt_user}, 402 | ] 403 | ) 404 | 405 | chatbot.messages = prompt_template.fill( 406 | statement_title=statement_title, 407 | statement_description=statement_description, 408 | statement_fill_in_the_blank=fill_in_the_blank, 409 | scraped_content=scraped_content, 410 | the_date=the_date, 411 | ) 412 | 413 | assistant_analysis = chatbot.resend() 414 | 415 | print("\n\n\n") 416 | print(assistant_analysis) 417 | 418 | prompt_template_2 = ChatPrompt( 419 | [ 420 | {"role": "system", "content": chat_prompt_system}, 421 | {"role": "user", "content": chat_prompt_user}, 422 | {"role": "assistant", "content": "{assistant_analysis}"}, 423 | {"role": "user", "content": chat_prompt_user_followup}, 424 | ] 425 | ) 426 | 427 | chatbot.messages = prompt_template_2.fill( 428 | statement_title=statement_title, 429 | statement_description=statement_description, 430 | statement_fill_in_the_blank=fill_in_the_blank, 431 | scraped_content=scraped_content, 432 | assistant_analysis=assistant_analysis, 433 | the_date=the_date, 434 | ) 435 | 436 | filled_in_statement = chatbot.resend() 437 | 438 | print("\n\n\n") 439 | print(filled_in_statement) 440 | 441 | uh = UtilityHelper(openai_api_key) 442 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 443 | 444 | response = client.create_forecast( 445 | statement_id, 446 | prediction_title, 447 | assistant_analysis, 448 | prediction, 449 | prediction_agent, 450 | { 451 | "full_response_from_llm": assistant_analysis, 452 | "raw_forecast": filled_in_statement, 453 | "extracted_value": prediction, 454 | }, 455 | ) 456 | 457 | for ar in accessed_resources: 458 | knowledge_base.log_access(ar) 459 | 460 | return response 461 | -------------------------------------------------------------------------------- /emergingtrajectories/chunkers.py: -------------------------------------------------------------------------------- 1 | """ 2 | chunkers.py is used to chunk facts using different strategies. Emerging Trajectories started by chunking via GPT-4, but we can also appreciate using sentences, paragraphs, or other verbatim approaches. We'll be adding more chunkers as time goes on. 3 | 4 | Chunkers should simply take a piece of content and chunk it into a list of facts. 5 | """ 6 | 7 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 8 | from phasellm.agents import WebpageAgent, WebSearchAgent 9 | 10 | fact_system_prompt = """You are a researcher helping extract facts about {topic}, trends, and related observations. We will give you a piece of content scraped on the web. Please extract facts from this. Each fact should stand on its own, and can be several sentences long if need be. You can have as many facts as needed. 
For each fact, please start it as a new line with "---" as the bullet point. For example: 11 | 12 | --- Fact 1... This is the fact. 13 | --- Here is a second fact. 14 | --- And a third fact. 15 | 16 | Please do not include new lines between bullet points. Make sure you write your facts in ENGLISH. Translate any foreign language content/facts/observations into ENGLISH. 17 | 18 | We will simply provide you with content and you will just provide facts.""" 19 | 20 | 21 | class ChunkerGPT4: 22 | 23 | def __init__(self, openai_api_key: str, model="gpt-4-turbo"): 24 | """ 25 | Chunker based on GPT-4 reading text and providing a list of facts. 26 | 27 | Args: 28 | openai_api_key (str): The OpenAI API key. 29 | model (str): The OpenAI model to use. Defaults to "gpt-4-turbo". 30 | """ 31 | self.openai_api_key = openai_api_key 32 | self.model = model 33 | 34 | def chunk(self, content: str, topic: str) -> list[str]: 35 | """ 36 | Chunk text into facts. 37 | 38 | Args: 39 | content (str): The content to chunk. 40 | topic (str): The topic to focus on when building facts. 41 | 42 | Returns: 43 | list[str]: The list of facts. 44 | """ 45 | 46 | llm = OpenAIGPTWrapper(self.openai_api_key, model=self.model) 47 | chatbot = ChatBot(llm) 48 | chatbot.messages = [{"role": "system", "content": fact_system_prompt}] 49 | 50 | prompt_template = ChatPrompt( 51 | [ 52 | {"role": "system", "content": fact_system_prompt}, 53 | ] 54 | ) 55 | 56 | chatbot.messages = prompt_template.fill(topic=topic) 57 | 58 | response = chatbot.chat(content) 59 | 60 | lines = response.split("\n") 61 | 62 | facts = [] 63 | 64 | for line in lines: 65 | if line[0:4] == "--- ": 66 | fact = line[4:] 67 | facts.append(fact) 68 | 69 | return facts 70 | 71 | 72 | class ChunkerNewLines: 73 | 74 | def __init__(self, min_length: int = 7): 75 | """ 76 | Chunker using line breaks for content. 77 | 78 | Args: 79 | min_length (int): The minimum length (in characters) of a fact. Defaults to 7 characters. 80 | """ 81 | self.min_length = min_length 82 | 83 | def chunk(self, content: str, topic: str = None) -> list[str]: 84 | """ 85 | Chunk text into facts. 86 | 87 | Args: 88 | content (str): The content to chunk. 89 | topic (str): The topic to focus on when building facts. This defaults to None so we can keep the same function calls as other chunkers. 90 | 91 | Returns: 92 | list[str]: The list of facts. 93 | """ 94 | 95 | lines = content.split("\n") 96 | 97 | facts = [] 98 | 99 | for line in lines: 100 | ls = line.strip() 101 | if len(ls) >= self.min_length: 102 | facts.append(ls) 103 | 104 | return facts 105 | -------------------------------------------------------------------------------- /emergingtrajectories/citationagents.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents for generating forecasts. 3 | """ 4 | 5 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 6 | from phasellm.agents import WebpageAgent, WebSearchAgent 7 | 8 | from . import Client 9 | from .utils import UtilityHelper 10 | from .knowledge import KnowledgeBaseFileCache 11 | 12 | # from . import scrapeandpredict as sap 13 | 14 | import datetime 15 | import re 16 | 17 | #### 18 | # EXTENDING FORECASTS 19 | # 20 | 21 | base_system_prompt_ext = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 22 | 23 | The project description is as follows... 
24 | {statement_description}
25 | 
26 | We need your help analyzing content and extracting any relevant information. We'll have a few requests for you... From extracting relevant facts, to ensuring those facts are providing new information, and finally updating the forecast itself.
27 | 
28 | The user will provide the relevant requests.
29 | """
30 | 
31 | ext_message_1 = """Today's date is {the_date}.
32 | 
33 | Here is all the content we've managed to collect.
34 | 
35 | ----------------------
36 | {scraped_content}
37 | ----------------------
38 | 
39 | Could you please extract the relevant facts from the content provided? Please simply respond by providing a list of facts in bullet point form, like so...
40 | 
41 | - Fact 1
42 | - Fact 2
43 | ... and so on.
44 | 
45 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you.
46 | 
47 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
48 | 
49 | """
50 | 
51 | ext_message_2 = """Today's date is {the_date}.
52 | 
53 | Assume all the content and facts above are accurate and correct up to today's date. The forecasting challenge we are working on is outlined below:
54 | {statement_fill_in_the_blank}
55 | 
56 | The earlier forecast was as follows...
57 | ----------------------
58 | PREDICTION: {forecast_value}
59 | 
60 | JUSTIFICATION...
61 | 
62 | {forecast_justification}
63 | ----------------------
64 | 
65 | Given the above, please use your logical thinking and reasoning to update the "justification" by including any new facts you provided earlier. Update the actual forecast prediction accordingly.
66 | 
67 | Make sure to reference the citation/source numbers from the fact list.
68 | 
69 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
70 | """
71 | 
72 | ext_message_3 = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank below... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
73 | 
74 | {statement_fill_in_the_blank}
75 | """
76 | 
77 | 
78 | def CiteExtendScrapePredictAgent(
79 |     openai_api_key: str,
80 |     google_api_key: str,
81 |     google_search_id: str,
82 |     google_search_query: str,
83 |     knowledge_base: KnowledgeBaseFileCache,
84 |     forecast_id: int,
85 |     et_api_key: str = None,
86 |     statement_title: str = None,
87 |     statement_description: str = None,
88 |     fill_in_the_blank: str = None,
89 |     chat_prompt_system: str = base_system_prompt_ext,
90 |     ext_message_1: str = ext_message_1,
91 |     ext_message_2: str = ext_message_2,
92 |     ext_message_3: str = ext_message_3,
93 |     prediction_title: str = "Prediction",
94 |     prediction_agent: str = "Generic Agent",
95 | ) -> dict:
96 |     """
97 |     Extends an existing forecast by scraping new web content and including any content from the knowledge base that has not yet been accessed (assuming there is new content).
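
    Example (an illustrative sketch only; the keys, forecast ID, and cache folder are placeholders, and the KnowledgeBaseFileCache constructor shown is assumed; see knowledge.py for its actual signature):

        from emergingtrajectories.knowledge import KnowledgeBaseFileCache
        from emergingtrajectories.citationagents import CiteExtendScrapePredictAgent

        kb = KnowledgeBaseFileCache("kb_cache")  # assumed: a local folder used as the cache

        response = CiteExtendScrapePredictAgent(
            openai_api_key="sk-...",
            google_api_key="...",
            google_search_id="...",
            google_search_query="crude oil price forecast",
            knowledge_base=kb,
            forecast_id=123,  # hypothetical ID of the existing forecast being extended
            et_api_key="et-...",
        )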
98 | 99 | Args: 100 | openai_api_key: the OpenAI API key 101 | google_api_key: the Google Search API key 102 | google_search_id: the Google search ID 103 | google_search_query: the Google search query 104 | knowledge_base: the KnowledgeBaseFileCache object 105 | forecast_id: the ID of the forecast to extend 106 | et_api_key: the Emerging Trajectories API key 107 | statement_title: the title of the statement (if not submitting a statement ID) 108 | statement_description: the description of the statement (if not submitting a statement ID) 109 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 110 | ext_message_1: the first message to the LLM 111 | ext_message_2: the second message to the LLM 112 | ext_message_3: the third message to the LLM 113 | prediction_title: the title of the forecast 114 | prediction_agent: the agent making the forecast 115 | 116 | Returns: 117 | dict: the response from the Emerging Trajectories platform 118 | """ 119 | 120 | if et_api_key is not None: 121 | client = Client(et_api_key) 122 | forecast = client.get_forecast(forecast_id) 123 | statement_id = forecast["statement_id"] 124 | statement = client.get_statement(statement_id) 125 | statement_title = statement["title"] 126 | statement_description = statement["description"] 127 | fill_in_the_blank = statement["fill_in_the_blank"] 128 | justification = forecast["justification"] 129 | forecast_value = forecast["value"] 130 | 131 | webagent = WebSearchAgent(api_key=google_api_key) 132 | results = webagent.search_google( 133 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 134 | ) 135 | 136 | scraped_content = "" 137 | 138 | added_new_content = False 139 | 140 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 141 | accessed_resources = [] 142 | 143 | ctr = 0 144 | ctr_to_source = {} 145 | 146 | for result in results: 147 | if not knowledge_base.in_cache(result.url): 148 | ctr += 1 149 | added_new_content = True 150 | page_content = knowledge_base.get(result.url) 151 | 152 | accessed_resources.append(result.url) 153 | # knowledge_base.log_access(result.url) 154 | 155 | scraped_content += ( 156 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 157 | ) 158 | ctr_to_source[ctr] = result.url 159 | 160 | # We also check the knowledge base for content that was added manually. 
161 | unaccessed_uris = knowledge_base.get_unaccessed_content() 162 | for ua in unaccessed_uris: 163 | added_new_content = True 164 | ctr += 1 165 | page_content = knowledge_base.get(ua) 166 | 167 | accessed_resources.append(ua) 168 | # knowledge_base.log_access(ua) 169 | 170 | scraped_content += f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 171 | ctr_to_source[ctr] = ua 172 | 173 | if not added_new_content: 174 | print("No new content added to the forecast.") 175 | return None 176 | 177 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 178 | 179 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 180 | chatbot = ChatBot(llm) 181 | 182 | # Steps 0 and 1 183 | 184 | prompt_template = ChatPrompt( 185 | [ 186 | {"role": "system", "content": chat_prompt_system}, 187 | {"role": "user", "content": ext_message_1}, 188 | ] 189 | ) 190 | 191 | chatbot.messages = prompt_template.fill( 192 | statement_title=statement_title, 193 | statement_description=statement_description, 194 | statement_fill_in_the_blank=fill_in_the_blank, 195 | scraped_content=scraped_content, 196 | the_date=the_date, 197 | forecast_value=str(forecast_value), 198 | forecast_justification=justification, 199 | ) 200 | 201 | new_facts = chatbot.resend() 202 | 203 | print("\n\n\n") 204 | print(new_facts) 205 | 206 | # Step 3 207 | 208 | prompt_template_2 = ChatPrompt( 209 | [ 210 | {"role": "system", "content": chat_prompt_system}, 211 | {"role": "user", "content": ext_message_1}, 212 | {"role": "assistant", "content": "{new_facts}"}, 213 | {"role": "user", "content": ext_message_2}, 214 | ] 215 | ) 216 | 217 | chatbot.messages = prompt_template_2.fill( 218 | statement_title=statement_title, 219 | statement_description=statement_description, 220 | statement_fill_in_the_blank=fill_in_the_blank, 221 | scraped_content=scraped_content, 222 | new_facts=new_facts, 223 | the_date=the_date, 224 | forecast_value=str(forecast_value), 225 | forecast_justification=justification, 226 | ) 227 | 228 | assistant_analysis = chatbot.resend() 229 | 230 | print("\n\n\n") 231 | print(assistant_analysis) 232 | 233 | # Step 4 234 | 235 | prompt_template_3 = ChatPrompt( 236 | [ 237 | {"role": "system", "content": chat_prompt_system}, 238 | {"role": "user", "content": ext_message_1}, 239 | {"role": "assistant", "content": "{new_facts}"}, 240 | {"role": "user", "content": ext_message_2}, 241 | {"role": "assistant", "content": "{assistant_analysis}"}, 242 | {"role": "user", "content": ext_message_3}, 243 | ] 244 | ) 245 | 246 | chatbot.messages = prompt_template_3.fill( 247 | statement_title=statement_title, 248 | statement_description=statement_description, 249 | statement_fill_in_the_blank=fill_in_the_blank, 250 | scraped_content=scraped_content, 251 | new_facts=new_facts, 252 | assistant_analysis=assistant_analysis, 253 | the_date=the_date, 254 | forecast_value=str(forecast_value), 255 | forecast_justification=justification, 256 | ) 257 | 258 | filled_in_statement = chatbot.resend() 259 | 260 | print("\n\n\n") 261 | print(filled_in_statement) 262 | 263 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 264 | 265 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 266 | print(assistant_analysis_sourced) 267 | 268 | uh = UtilityHelper(openai_api_key) 269 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 270 | 271 | response = client.create_forecast( 272 | statement_id, 273 | prediction_title, 274 | assistant_analysis_sourced, 275 | prediction, 276 | 
        prediction_agent,
277 |         {
278 |             "full_response_from_llm_before_source_cleanup": assistant_analysis,
279 |             "full_response_from_llm": assistant_analysis_sourced,
280 |             "raw_forecast": filled_in_statement,
281 |             "extracted_value": prediction,
282 |         },
283 |         forecast_id,
284 |     )
285 | 
286 |     for ar in accessed_resources:
287 |         knowledge_base.log_access(ar)
288 | 
289 |     return response
290 | 
291 | 
292 | ####
293 | # INITIAL FORECAST
294 | #
295 | 
296 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}.
297 | 
298 | The project description is as follows...
299 | {statement_description}
300 | 
301 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points.
302 | 
303 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank}
304 | """
305 | 
306 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect.
307 | 
308 | ----------------------
309 | {scraped_content}
310 | ----------------------
311 | 
312 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discussing your logic and rationale for making a forecast based on the above.
313 | 
314 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you.
315 | 
316 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
317 | 
318 | Do this for the final justification of your forecast as well.
319 | 
320 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
321 | """
322 | 
323 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
324 | 
325 | {statement_fill_in_the_blank}
326 | """
327 | 
328 | 
329 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str:
330 |     """
331 |     The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the citations to follow the proper numerical order, and also include the URLs at the very end.
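
    For example (illustrative values only), given the analysis text
    "Prices rose in March [4]. Exports fell [2]." and ctr_to_source
    {4: "https://example.com/a", 2: "https://example.com/b"}, the result is
    "Prices rose in March [1]. Exports fell [2]." followed by a
    "--- SOURCES ---" block listing "1: https://example.com/a" and
    "2: https://example.com/b".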
332 | 333 | Args: 334 | assistant_analysis: the analysis text from the assistant 335 | ctr_to_source: the mapping of citation number to source URL 336 | 337 | Returns: 338 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 339 | """ 340 | 341 | new_ctr_map = {} 342 | ctr = 1 343 | 344 | end_notes = "\n\n--- SOURCES ---\n\n" 345 | new_analysis = "" 346 | 347 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 348 | 349 | last_index = 0 350 | for m in matches: 351 | 352 | # print(m.group()) 353 | # print(m.start()) 354 | # print(m.end()) 355 | # print(assistant_analysis[m.start() - 1: m.end() + 1]) 356 | 357 | m_start = m.start() + 1 358 | m_end = m.end() - 1 359 | 360 | old_ctr = int(m.group()[1:-1]) 361 | uri = ctr_to_source[old_ctr] 362 | 363 | if old_ctr not in new_ctr_map: 364 | new_ctr_map[old_ctr] = ctr 365 | end_notes += f"{ctr}: {uri}\n" 366 | ctr += 1 367 | 368 | new_analysis += assistant_analysis[last_index:m_start] + str( 369 | new_ctr_map[old_ctr] 370 | ) 371 | last_index = m_end 372 | 373 | if last_index != 0: 374 | new_analysis += assistant_analysis[last_index:] + end_notes 375 | 376 | else: 377 | new_analysis = assistant_analysis + end_notes + "No citations provided." 378 | 379 | return new_analysis 380 | 381 | 382 | # In this case, we also get any documents that haven't been accessed by the agent. 383 | # This is why agent <-> kb needs to be a 1:1 relationship. 384 | def CitationScrapeAndPredictAgent( 385 | openai_api_key: str, 386 | google_api_key: str, 387 | google_search_id: str, 388 | google_search_query: str, 389 | knowledge_base: KnowledgeBaseFileCache = None, 390 | statement_id: int = -1, 391 | et_api_key: str = None, 392 | statement_title: str = None, 393 | statement_description: str = None, 394 | fill_in_the_blank: str = None, 395 | chat_prompt_system: str = base_system_prompt, 396 | chat_prompt_user: str = base_user_prompt, 397 | chat_prompt_user_followup: str = base_user_prompt_followup, 398 | prediction_title: str = "Prediction", 399 | prediction_agent: str = "Generic Agent", 400 | ) -> dict: 401 | """ 402 | This agent submits a search query to Google to find information related to its forecast. It also uses any information that it has not previously accessed in its KnowledgeBase. It then generates a forecast with all the relevant information. 
403 | 404 | Args: 405 | openai_api_key: the OpenAI API key 406 | google_api_key: the Google Search API key 407 | google_search_id: the Google search ID 408 | google_search_query: the Google search query 409 | knowledge_base: the KnowledgeBaseFileCache object 410 | statement_id: the ID of the statement to use 411 | et_api_key: the Emerging Trajectories API key 412 | statement_title: the title of the statement (if not submitting a statement ID) 413 | statement_description: the description of the statement (if not submitting a statement ID) 414 | fill_in_the_blank: the fill-in-the-blank component of the statement (if not submitting a statement ID) 415 | chat_prompt_system: the system prompt for the chatbot (optional, for overriding defaults) 416 | chat_prompt_user: the user prompt for the chatbot (optional, for overriding defaults) 417 | chat_prompt_user_followup: the follow-up user prompt for the chatbot (optional, for overriding defaults) 418 | prediction_title: the title of the forecast 419 | prediction_agent: the agent making the forecast 420 | 421 | Returns: 422 | dict: the response from the Emerging Trajectories platform 423 | """ 424 | 425 | if et_api_key is not None: 426 | client = Client(et_api_key) 427 | statement = client.get_statement(statement_id) 428 | statement_title = statement["title"] 429 | statement_description = statement["description"] 430 | fill_in_the_blank = statement["fill_in_the_blank"] 431 | 432 | if statement_id == -1 and ( 433 | statement_title is None 434 | or statement_description is None 435 | or fill_in_the_blank is None 436 | ): 437 | raise Exception( 438 | "You must provide either a statement ID or a statement title, description, and fill-in-the-blank." 439 | ) 440 | 441 | webagent = WebSearchAgent(api_key=google_api_key) 442 | results = webagent.search_google( 443 | query=google_search_query, custom_search_engine_id=google_search_id, num=10 444 | ) 445 | 446 | scraped_content = "" 447 | 448 | added_new_content = False 449 | 450 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 451 | accessed_resources = [] 452 | 453 | ctr = 0 454 | ctr_to_source = {} 455 | 456 | for result in results: 457 | if not knowledge_base.in_cache(result.url): 458 | ctr += 1 459 | added_new_content = True 460 | page_content = knowledge_base.get(result.url) 461 | 462 | accessed_resources.append(result.url) 463 | # knowledge_base.log_access(result.url) 464 | 465 | scraped_content += ( 466 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 467 | ) 468 | ctr_to_source[ctr] = result.url 469 | 470 | # We also check the knowledge base for content that was added manually. 
471 | unaccessed_uris = knowledge_base.get_unaccessed_content() 472 | for ua in unaccessed_uris: 473 | added_new_content = True 474 | ctr += 1 475 | page_content = knowledge_base.get(ua) 476 | 477 | accessed_resources.append(ua) 478 | # knowledge_base.log_access(ua) 479 | 480 | scraped_content += f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 481 | ctr_to_source[ctr] = ua 482 | 483 | if not added_new_content: 484 | print("No new content added to the forecast.") 485 | return None 486 | 487 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 488 | 489 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 490 | chatbot = ChatBot(llm) 491 | 492 | prompt_template = ChatPrompt( 493 | [ 494 | {"role": "system", "content": chat_prompt_system}, 495 | {"role": "user", "content": chat_prompt_user}, 496 | ] 497 | ) 498 | 499 | chatbot.messages = prompt_template.fill( 500 | statement_title=statement_title, 501 | statement_description=statement_description, 502 | statement_fill_in_the_blank=fill_in_the_blank, 503 | scraped_content=scraped_content, 504 | the_date=the_date, 505 | ) 506 | 507 | assistant_analysis = chatbot.resend() 508 | 509 | print("\n\n\n") 510 | print(assistant_analysis) 511 | 512 | prompt_template_2 = ChatPrompt( 513 | [ 514 | {"role": "system", "content": chat_prompt_system}, 515 | {"role": "user", "content": chat_prompt_user}, 516 | {"role": "assistant", "content": "{assistant_analysis}"}, 517 | {"role": "user", "content": chat_prompt_user_followup}, 518 | ] 519 | ) 520 | 521 | chatbot.messages = prompt_template_2.fill( 522 | statement_title=statement_title, 523 | statement_description=statement_description, 524 | statement_fill_in_the_blank=fill_in_the_blank, 525 | scraped_content=scraped_content, 526 | assistant_analysis=assistant_analysis, 527 | the_date=the_date, 528 | ) 529 | 530 | filled_in_statement = chatbot.resend() 531 | 532 | print("\n\n\n") 533 | print(filled_in_statement) 534 | 535 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 536 | 537 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 538 | print(assistant_analysis_sourced) 539 | 540 | uh = UtilityHelper(openai_api_key) 541 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 542 | 543 | response = client.create_forecast( 544 | statement_id, 545 | prediction_title, 546 | assistant_analysis_sourced, 547 | prediction, 548 | prediction_agent, 549 | { 550 | "full_response_from_llm_before_source_cleanup": assistant_analysis, 551 | "full_response_from_llm": assistant_analysis_sourced, 552 | "raw_forecast": filled_in_statement, 553 | "extracted_value": prediction, 554 | }, 555 | ) 556 | 557 | for ar in accessed_resources: 558 | knowledge_base.log_access(ar) 559 | 560 | return response 561 | -------------------------------------------------------------------------------- /emergingtrajectories/crawlers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawlers provide a standardized approach to interacting with with web pages and extracting information. We have a number of crawlers based on PhaseLLM (Python requests) and ones using Playwright (headlessly and with a front-end) to enable flexible scraping. 3 | 4 | All scraping agents return the raw HTML content and the extracted text content. 
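
A minimal usage sketch (the URL is a placeholder, and Playwright's browser binaries must already be installed for crawlerPlaywright to run):

    from emergingtrajectories.crawlers import crawlerPlaywright

    crawler = crawlerPlaywright(headless=True)
    html, text = crawler.get_content("https://example.com")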
5 | """ 6 | 7 | from playwright.sync_api import sync_playwright 8 | from bs4 import BeautifulSoup 9 | 10 | from phasellm.agents import WebpageAgent 11 | 12 | from scrapingbee import ScrapingBeeClient 13 | 14 | 15 | def _bs4_childtraversal(html: str) -> str: 16 | """ 17 | Recursively travserse the DOM to extract content. 18 | 19 | Args: 20 | html (str): HTML content 21 | 22 | Returns: 23 | str: Extracted content 24 | """ 25 | 26 | if len(str(html).strip()) < 2: 27 | return "" 28 | 29 | new_html = "" 30 | 31 | for content in html: 32 | contentname = "" 33 | 34 | if isinstance(content, str): 35 | contentname = "" 36 | elif content.name is not None: 37 | contentname = content.name.lower() 38 | 39 | if contentname in ["p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "span"]: 40 | text = content.get_text() 41 | num_words = len(text.strip().split(" ")) 42 | # print(num_words) 43 | if num_words > 7: 44 | new_html = new_html + content.get_text() + "\n\n" 45 | else: 46 | new_html = new_html + _bs4_childtraversal(content) 47 | 48 | return new_html 49 | 50 | 51 | def _get_text_bs4(html: str) -> str: 52 | """ 53 | Extract text content from HTML using BeautifulSoup. 54 | 55 | Args: 56 | html (str): HTML content 57 | 58 | Returns: 59 | str: Extracted text content 60 | """ 61 | 62 | new_html = "" 63 | 64 | souppre = BeautifulSoup(html, "html.parser") 65 | soup = souppre.body 66 | 67 | for content in soup.contents: 68 | contentname = "" 69 | if content.name is not None: 70 | contentname = content.name.lower() 71 | if contentname not in ["script", "style"]: 72 | new_html = new_html + _bs4_childtraversal(content) 73 | 74 | new_html = new_html + "" 75 | 76 | newsoup = BeautifulSoup(new_html, "html.parser") 77 | text = newsoup.get_text() 78 | 79 | return text 80 | 81 | 82 | class crawlerPlaywright: 83 | 84 | def __init__(self, headless: bool = True) -> None: 85 | """ 86 | Crawler that uses Playwright to scrape web pages. 87 | 88 | Args: 89 | headless (bool, optional): Run the browser in headless mode. Defaults to True. 90 | """ 91 | self.headless = headless 92 | 93 | def get_content(self, url: str) -> tuple[str, str]: 94 | """ 95 | Gets content for a specific URL. 96 | 97 | Args: 98 | url (str): URL to scrape 99 | 100 | Returns: 101 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 102 | """ 103 | 104 | content = "" 105 | text = "" 106 | with sync_playwright() as playwright: 107 | 108 | browser = playwright.chromium.launch(headless=self.headless) 109 | page = browser.new_page() 110 | 111 | # Navigate to the webpage 112 | page.goto(url) 113 | 114 | # Extract data 115 | content = page.content() 116 | 117 | # Close the browser 118 | browser.close() 119 | 120 | text = _get_text_bs4(content) 121 | 122 | return content, text 123 | 124 | 125 | class crawlerPhaseLLM: 126 | 127 | def __init__(self): 128 | """ 129 | PhaseLLM scraper. Uses Python requests and does not execute JS. 130 | """ 131 | self.scraper = WebpageAgent() 132 | 133 | def get_content(self, url): 134 | """ 135 | Gets content for a specific URL. 
136 | 137 | Args: 138 | url (str): URL to scrape 139 | 140 | Returns: 141 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 142 | """ 143 | content_raw = self.scraper.scrape(url, text_only=False, body_only=False) 144 | content_parsed = self.scraper.scrape(url, text_only=True, body_only=True) 145 | return content_raw, content_parsed 146 | 147 | 148 | class crawlerScrapingBee: 149 | 150 | def __init__(self, api_key: str): 151 | """ 152 | Crawler that uses ScrapingBee to scrape web pages. 153 | """ 154 | self.client = ScrapingBeeClient(api_key=api_key) 155 | 156 | def get_content(self, url): 157 | """ 158 | Gets content for a specific URL. 159 | 160 | Args: 161 | url (str): URL to scrape 162 | 163 | Returns: 164 | tuple[str, str]: Raw HTML content and extracted text content (in this order) 165 | """ 166 | 167 | response = self.client.get(url) 168 | content_raw = response.content.decode("utf-8") 169 | content_parsed = _get_text_bs4(content_raw) 170 | return content_raw, content_parsed 171 | -------------------------------------------------------------------------------- /emergingtrajectories/facts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Facts agent. Similar to knowledge agent but simply provides a list of facts and associated sources. 3 | 4 | This abstracts away the fact generation from forecast creation, thus allowing us to test different prompting strategies and LLMs. 5 | """ 6 | 7 | import os 8 | import json 9 | import hashlib 10 | import re 11 | 12 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 13 | from django.core.serializers.json import DjangoJSONEncoder 14 | 15 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 16 | from phasellm.agents import WebpageAgent, WebSearchAgent 17 | 18 | from datetime import datetime 19 | 20 | from . import Client 21 | from .crawlers import crawlerPlaywright 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | # Number of search results to return from web searche (default value). 25 | _DEFAULT_NUM_SEARCH_RESULTS = 10 26 | 27 | facts_base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 28 | 29 | The project description is as follows... 30 | {statement_description} 31 | 32 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking. Rather than using bullet points, please list each as F1, F2, F3, etc... So that we can reference it. 33 | 34 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 35 | 36 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number. 37 | 38 | Thus, a bullet point would look like this: 39 | F1: (information) [1] 40 | F2: (information) [1] 41 | F3: (information) [2] 42 | 43 | ... and so on, where F1, F2, F3, etc. are facts, and [1], [2] are the source documents you are extracting the facts from. 44 | """ 45 | 46 | facts_base_user_prompt = """Today's date is {the_date}. 
We will now provide you with all the content we've managed to collect. 47 | 48 | ---------------------- 49 | {scraped_content} 50 | ---------------------- 51 | 52 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) share any insights you might have based on the facts. 53 | 54 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 55 | 56 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the actual number. 57 | 58 | DO NOT PROVIDE A FORECAST, BUT SIMPLY STATE AND SHARE THE FACTS AND INSIGHTS YOU HAVE GATHERED. 59 | """ 60 | 61 | 62 | def uri_to_local(uri: str) -> str: 63 | """ 64 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 65 | 66 | Args: 67 | uri (str): The URI to convert. 68 | 69 | Returns: 70 | str: The MD5 sum of the URI. 71 | """ 72 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 73 | return uri_md5 74 | 75 | 76 | # TODO Move to Utils.py, or elsewhere. 77 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str: 78 | """ 79 | The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the cituations to follow the proper numerical order, and also include the URLs at the very end. 80 | 81 | Args: 82 | assistant_analysis: the analysis text from the assistant 83 | ctr_to_source: the mapping of citation number to source URL 84 | 85 | Returns: 86 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 87 | """ 88 | 89 | new_ctr_map = {} 90 | ctr = 1 91 | 92 | end_notes = "\n\n--- SOURCES ---\n\n" 93 | new_analysis = "" 94 | 95 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 96 | 97 | last_index = 0 98 | for m in matches: 99 | 100 | m_start = m.start() + 1 101 | m_end = m.end() - 1 102 | 103 | old_ctr = int(m.group()[1:-1]) 104 | uri = ctr_to_source[old_ctr] 105 | 106 | if old_ctr not in new_ctr_map: 107 | new_ctr_map[old_ctr] = ctr 108 | end_notes += f"{ctr}: {uri}\n" 109 | ctr += 1 110 | 111 | new_analysis += assistant_analysis[last_index:m_start] + str( 112 | new_ctr_map[old_ctr] 113 | ) 114 | last_index = m_end 115 | 116 | if last_index != 0: 117 | new_analysis += assistant_analysis[last_index:] + end_notes 118 | 119 | else: 120 | new_analysis = assistant_analysis + end_notes + "No citations provided." 121 | 122 | return new_analysis 123 | 124 | 125 | # TODO If this works, it should be an agent with setllm() supported, etc. 126 | class FactBaseFileCache: 127 | 128 | def __init__( 129 | self, folder_path: str, cache_file: str = "cache.json", crawler=None 130 | ) -> None: 131 | """ 132 | The KnowledgeBaseFileCache is a simple file-based cache for web content and local files. The cache stores the original HTML, PDF, or TXT content and tracks when (if ever) an agent actually accessed the content. 133 | 134 | Args: 135 | folder_path (str): The folder where the cache will be stored. 136 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json". 
137 | """ 138 | self.root_path = folder_path 139 | self.root_parsed = os.path.join(folder_path, "parsed") 140 | self.root_original = os.path.join(folder_path, "original") 141 | self.cache_file = os.path.join(folder_path, cache_file) 142 | self.cache = self.load_cache() 143 | 144 | if crawler is None: 145 | self.crawler = crawlerPlaywright() 146 | else: 147 | self.crawler = crawler 148 | 149 | # TODO: this function is a new one compared to the KnowledgeBaseFileCache 150 | # TODO: refactor this + code where we run one query 151 | def summarize_new_info_multiple_queries( 152 | self, 153 | statement, 154 | chatbot, 155 | google_api_key, 156 | google_search_id, 157 | google_search_queries, 158 | fileout=None, 159 | ) -> str: 160 | 161 | self.google_api_key = google_api_key 162 | self.google_search_id = google_search_id 163 | self.google_search_queries = google_search_queries 164 | 165 | webagent = WebSearchAgent(api_key=self.google_api_key) 166 | 167 | scraped_content = "" 168 | added_new_content = False 169 | 170 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 171 | accessed_resources = [] 172 | 173 | ctr = 0 174 | ctr_to_source = {} 175 | 176 | for google_search_query in self.google_search_queries: 177 | 178 | results = webagent.search_google( 179 | query=google_search_query, 180 | custom_search_engine_id=self.google_search_id, 181 | num=_DEFAULT_NUM_SEARCH_RESULTS, 182 | ) 183 | 184 | added_new_content = False 185 | 186 | for result in results: 187 | if not self.in_cache(result.url): 188 | ctr += 1 189 | added_new_content = True 190 | 191 | try: 192 | page_content = self.get(result.url) 193 | print(page_content) 194 | except Exception as e: 195 | print(f"Failed to get content from {result.url}\n{e}") 196 | self.force_empty(result.url) 197 | page_content = "" 198 | 199 | accessed_resources.append(result.url) 200 | # knowledge_base.log_access(result.url) 201 | 202 | scraped_content += ( 203 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 204 | ) 205 | ctr_to_source[ctr] = result.url 206 | 207 | # We also check the knowledge base for content that was added manually. 
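# Illustrative sketch (added for clarity, not part of the original code): clean_citations(),
# defined near the top of this module, renumbers the [#] citations that the LLM produces
# against the "--- SOURCE: # ---" blocks assembled above. The URLs here are hypothetical.
#
#   analysis = "Prices weakened in March [3]. Inventories also grew [1]. Demand stayed soft [3]."
#   cleaned = clean_citations(analysis, {1: "https://example.com/inventories", 3: "https://example.com/prices"})
#
# Citations are renumbered in the order they first appear -- [3] becomes [1] and [1] becomes [2] --
# and a "--- SOURCES ---" section mapping the new numbers to their URLs is appended to the text.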
208 | unaccessed_uris = self.get_unaccessed_content() 209 | for ua in unaccessed_uris: 210 | added_new_content = True 211 | ctr += 1 212 | page_content = self.get(ua) 213 | 214 | accessed_resources.append(ua) 215 | # knowledge_base.log_access(ua) 216 | 217 | scraped_content += ( 218 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 219 | ) 220 | ctr_to_source[ctr] = ua 221 | 222 | if not added_new_content: 223 | print("No new content added to the forecast.") 224 | return None 225 | 226 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 227 | 228 | prompt_template = ChatPrompt( 229 | [ 230 | {"role": "system", "content": facts_base_system_prompt}, 231 | {"role": "user", "content": facts_base_user_prompt}, 232 | ] 233 | ) 234 | 235 | chatbot.messages = prompt_template.fill( 236 | statement_title=statement.title, 237 | statement_description=statement.description, 238 | statement_fill_in_the_blank=statement.fill_in_the_blank, 239 | scraped_content=scraped_content, 240 | the_date=the_date, 241 | ) 242 | 243 | assistant_analysis = chatbot.resend() 244 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 245 | 246 | print("\n\n\n") 247 | print(assistant_analysis_sourced) 248 | 249 | if fileout is not None: 250 | with open(fileout, "w") as w: 251 | w.write(assistant_analysis_sourced) 252 | 253 | for ar in accessed_resources: 254 | self.log_access(ar) 255 | 256 | return assistant_analysis_sourced 257 | 258 | # TODO: this function is a new one compared to the KnowledgeBaseFileCache 259 | def summarize_new_info( 260 | self, 261 | statement, 262 | chatbot, 263 | google_api_key, 264 | google_search_id, 265 | google_search_query, 266 | fileout=None, 267 | ) -> str: 268 | 269 | self.google_api_key = google_api_key 270 | self.google_search_id = google_search_id 271 | self.google_search_query = google_search_query 272 | 273 | webagent = WebSearchAgent(api_key=self.google_api_key) 274 | results = webagent.search_google( 275 | query=self.google_search_query, 276 | custom_search_engine_id=self.google_search_id, 277 | num=_DEFAULT_NUM_SEARCH_RESULTS, 278 | ) 279 | 280 | scraped_content = "" 281 | 282 | added_new_content = False 283 | 284 | # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt. 285 | accessed_resources = [] 286 | 287 | ctr = 0 288 | ctr_to_source = {} 289 | 290 | for result in results: 291 | if not self.in_cache(result.url): 292 | ctr += 1 293 | added_new_content = True 294 | 295 | try: 296 | page_content = self.get(result.url) 297 | print(page_content) 298 | except Exception as e: 299 | print(f"Failed to get content from {result.url}\n{e}") 300 | self.force_empty(result.url) 301 | page_content = "" 302 | 303 | accessed_resources.append(result.url) 304 | # knowledge_base.log_access(result.url) 305 | 306 | scraped_content += ( 307 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 308 | ) 309 | ctr_to_source[ctr] = result.url 310 | 311 | # We also check the knowledge base for content that was added manually. 
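# Minimal usage sketch (illustrative only; the API keys, query, and statement object are
# placeholders -- the statement needs .title, .description, and .fill_in_the_blank attributes).
# summarize_new_info() runs a single Google query, scrapes any uncached results, and returns a
# sourced fact list, or None when no new content was found:
#
#   kb = FactBaseFileCache("et_fact_cache")
#   llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview")
#   summary = kb.summarize_new_info(statement, ChatBot(llm), google_api_key,
#                                   google_search_id, "global oil demand forecast",
#                                   fileout="latest_facts.txt")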
312 | unaccessed_uris = self.get_unaccessed_content() 313 | for ua in unaccessed_uris: 314 | added_new_content = True 315 | ctr += 1 316 | page_content = self.get(ua) 317 | 318 | accessed_resources.append(ua) 319 | # knowledge_base.log_access(ua) 320 | 321 | scraped_content += ( 322 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 323 | ) 324 | ctr_to_source[ctr] = ua 325 | 326 | if not added_new_content: 327 | print("No new content added to the forecast.") 328 | return None 329 | 330 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 331 | 332 | prompt_template = ChatPrompt( 333 | [ 334 | {"role": "system", "content": facts_base_system_prompt}, 335 | {"role": "user", "content": facts_base_user_prompt}, 336 | ] 337 | ) 338 | 339 | chatbot.messages = prompt_template.fill( 340 | statement_title=statement.title, 341 | statement_description=statement.description, 342 | statement_fill_in_the_blank=statement.fill_in_the_blank, 343 | scraped_content=scraped_content, 344 | the_date=the_date, 345 | ) 346 | 347 | assistant_analysis = chatbot.resend() 348 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 349 | 350 | print("\n\n\n") 351 | print(assistant_analysis_sourced) 352 | 353 | if fileout is not None: 354 | with open(fileout, "w") as w: 355 | w.write(assistant_analysis_sourced) 356 | 357 | for ar in accessed_resources: 358 | self.log_access(ar) 359 | 360 | return assistant_analysis_sourced 361 | 362 | def save_state(self) -> None: 363 | """ 364 | Saves the in-memory changes to the knowledge base to the JSON cache file. 365 | """ 366 | with open(self.cache_file, "w") as f: 367 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 368 | 369 | def load_cache(self) -> None: 370 | """ 371 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 372 | """ 373 | 374 | if not os.path.exists(self.root_path): 375 | os.makedirs(self.root_path) 376 | 377 | if not os.path.exists(self.root_parsed): 378 | os.makedirs(self.root_parsed) 379 | 380 | if not os.path.exists(self.root_original): 381 | os.makedirs(self.root_original) 382 | 383 | if not os.path.exists(self.cache_file): 384 | with open(self.cache_file, "w") as f: 385 | f.write("{}") 386 | 387 | with open(self.cache_file, "r") as f: 388 | return json.load(f) 389 | 390 | def in_cache(self, uri: str) -> bool: 391 | """ 392 | Checks if a URI is in the cache already. 393 | 394 | Args: 395 | uri (str): The URI to check. 396 | 397 | Returns: 398 | bool: True if the URI is in the cache, False otherwise. 399 | """ 400 | if uri in self.cache: 401 | return True 402 | return False 403 | 404 | def update_cache( 405 | self, uri: str, obtained_on: datetime, last_accessed: datetime 406 | ) -> None: 407 | """ 408 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 409 | 410 | Args: 411 | uri (str): The URI to update. 412 | obtained_on (datetime): The date and time when the content was obtained. 413 | last_accessed (datetime): The date and time when the content was last accessed. 414 | """ 415 | uri_md5 = uri_to_local(uri) 416 | self.cache[uri] = { 417 | "obtained_on": obtained_on, 418 | "last_accessed": last_accessed, 419 | "accessed": 0, 420 | "uri_md5": uri_md5, 421 | } 422 | self.save_state() 423 | 424 | def log_access(self, uri: str) -> None: 425 | """ 426 | Saves the last accessed time and updates the accessed tracker for a given URI. 427 | 428 | Args: 429 | uri (str): The URI to update. 
430 | """ 431 | self.cache[uri]["last_accessed"] = datetime.now() 432 | self.cache[uri]["accessed"] = 1 433 | self.save_state() 434 | 435 | def get_unaccessed_content(self) -> list[str]: 436 | """ 437 | Returns a list of URIs that have not been accessed by the agent. 438 | 439 | Returns: 440 | list[str]: A list of URIs that have not been accessed by the agent. 441 | """ 442 | unaccessed = [] 443 | for uri in self.cache: 444 | if self.cache[uri]["accessed"] == 0: 445 | unaccessed.append(uri) 446 | return unaccessed 447 | 448 | def force_empty(self, uri: str) -> None: 449 | """ 450 | Saves an empty file for a given URI. Used when the page is erroring out. 451 | 452 | Args: 453 | uri (str): The URI to empty the cache for. 454 | """ 455 | uri_md5 = uri_to_local(uri) 456 | 457 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 458 | f.write("") 459 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 460 | f.write("") 461 | 462 | self.update_cache(uri, datetime.now(), datetime.now()) 463 | 464 | def get(self, uri: str) -> str: 465 | """ 466 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 467 | 468 | Args: 469 | uri (str): The URI to get the content for. 470 | 471 | Returns: 472 | str: The content for the given URI. 473 | """ 474 | uri_md5 = uri_to_local(uri) 475 | if uri in self.cache: 476 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 477 | return f.read() 478 | else: 479 | # scraper = WebpageAgent() 480 | 481 | # content_raw = scraper.scrape(uri, text_only=False, body_only=False) 482 | # with open(os.path.join(self.root_original, uri_md5), "w") as f: 483 | # f.write(content_raw) 484 | 485 | # content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 486 | # with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 487 | # f.write(content_parsed) 488 | 489 | content, text = self.crawler.get_content(uri) 490 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 491 | f.write(content) 492 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 493 | f.write(text) 494 | 495 | self.update_cache(uri, datetime.now(), datetime.now()) 496 | 497 | return text 498 | 499 | def add_content(self, content: str, uri: str = None) -> None: 500 | """ 501 | Adds content to cache. 502 | 503 | Args: 504 | content (str): The content to add to the cache. 505 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 506 | """ 507 | if uri is None: 508 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 509 | uri_md5 = uri_to_local(uri) 510 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 511 | f.write(content) 512 | self.update_cache(uri, datetime.now(), datetime.now()) 513 | 514 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 515 | """ 516 | Adds content from a text file to the cache. 517 | 518 | Args: 519 | filepath (str): The path to the file to add to the cache. 520 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 
521 | """ 522 | with open(filepath, "r") as f: 523 | content = f.read() 524 | self.add_content(content, uri) 525 | -------------------------------------------------------------------------------- /emergingtrajectories/factsforecaster.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .facts import FactBaseFileCache 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you.""" 11 | 12 | start_user_prompt = """Here is the research: 13 | --------------------- 14 | {content} 15 | --------------------- 16 | {additional_facts} 17 | 18 | Given the above, we need you to do your best to fill in the following blank... 19 | {fill_in_the_blank} 20 | 21 | PLEASE DO THE FOLLOWING: 22 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 23 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 24 | - Do not provide a range; provide ONE number. 25 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 26 | 27 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 28 | """ 29 | 30 | extend_user_prompt = """Here is the research: 31 | --------------------- 32 | {content} 33 | --------------------- 34 | {additional_facts} 35 | 36 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 37 | --------------------- 38 | FORECAST: {earlier_forecast_value} 39 | 40 | JUSTIFICATION: 41 | {earlier_forecast} 42 | --------------------- 43 | 44 | Given the above, we need you to do your best to fill in the following blank... 45 | {fill_in_the_blank} 46 | 47 | PLEASE DO THE FOLLOWING: 48 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 49 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 50 | - Do not provide a range; provide ONE number. 51 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 52 | 53 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 54 | 55 | """ 56 | 57 | 58 | class FactForecastingAgent(object): 59 | 60 | # TODO: document / clean up 61 | def __init__( 62 | self, 63 | client: ETClient, 64 | chatbot: ChatBot, 65 | factbase: FactBaseFileCache, 66 | ): 67 | 68 | self.client = client 69 | self.chatbot = chatbot 70 | self.factbase = factbase 71 | 72 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 
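# Illustrative construction sketch (placeholder objects and keys; the exact ETClient setup may
# differ). The agent combines an ETClient, a ChatBot used to write the forecast itself, and a
# FactBaseFileCache that gathers and summarizes new source material:
#
#   llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview")
#   agent = FactForecastingAgent(client=et_client, chatbot=ChatBot(llm),
#                                factbase=FactBaseFileCache("fact_cache"))
#
# create_forecast() and extend_forecast() below accept either a single Google query string or a
# list of query strings; any other type raises a ValueError.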
73 | def setChatBot(self, chatbot): 74 | self.chatbot = chatbot 75 | 76 | # TODO: standardize -- camel case or snake case? Or something else? 77 | def getChatBot(self): 78 | return self.chatbot 79 | 80 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 81 | # TODO: Google query can be a list of queries, not just a single query. 82 | def create_forecast( 83 | self, 84 | statement: Statement, 85 | openai_api_key, 86 | et_api_key, 87 | google_api_key, 88 | google_search_id, 89 | google_search_query, 90 | facts=None, 91 | prediction_agent="Test Agent", 92 | ): 93 | 94 | fact_llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 95 | fact_chatbot = ChatBot(fact_llm) 96 | 97 | if isinstance(google_search_query, str): 98 | print("CALLING SINGLE QUERY...") 99 | content = self.factbase.summarize_new_info( 100 | statement, 101 | fact_chatbot, 102 | google_api_key, 103 | google_search_id, 104 | google_search_query, 105 | ) 106 | elif isinstance(google_search_query, list): 107 | content = self.factbase.summarize_new_info_multiple_queries( 108 | statement, 109 | fact_chatbot, 110 | google_api_key, 111 | google_search_id, 112 | google_search_query, 113 | ) 114 | else: 115 | raise ValueError( 116 | "google_search_query must be a string or a list of strings" 117 | ) 118 | 119 | if content is None: 120 | print("No new content added to the forecast.") 121 | return None 122 | 123 | chatbot_messages = [ 124 | {"role": "system", "content": start_system_prompt}, 125 | {"role": "user", "content": start_user_prompt}, 126 | ] 127 | 128 | chatbot = self.chatbot 129 | 130 | prompt_template = ChatPrompt(chatbot_messages) 131 | 132 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 133 | 134 | additional_facts = "" 135 | if facts is not None: 136 | additional_facts = "Some additional facts for consideration are below...\n" 137 | afctr = 1 138 | for f in facts: 139 | additional_facts += f"AF{afctr}: {f}\n" 140 | afctr += 1 141 | additional_facts += "---------------------\n\n" 142 | 143 | chatbot.messages = prompt_template.fill( 144 | statement_title=statement.title, 145 | statement_description=statement.description, 146 | statement_fill_in_the_blank=statement.fill_in_the_blank, 147 | fill_in_the_blank_2=statement.fill_in_the_blank, 148 | content=content, 149 | the_date=the_date, 150 | additional_facts=additional_facts, 151 | ) 152 | 153 | assistant_analysis = chatbot.resend() 154 | 155 | print("\n\n\n") 156 | print(assistant_analysis) 157 | 158 | uh = UtilityHelper(openai_api_key) 159 | prediction = uh.extract_prediction( 160 | assistant_analysis, statement.fill_in_the_blank 161 | ) 162 | 163 | client = Client(et_api_key) 164 | 165 | full_content = content + "\n\n-----------------\n\n" + assistant_analysis 166 | 167 | response = client.create_forecast( 168 | statement.id, 169 | "Prediction", 170 | full_content, 171 | prediction, 172 | prediction_agent, 173 | { 174 | "full_response_from_llm_before_source_cleanup": content, 175 | "full_response_from_llm": assistant_analysis, 176 | "extracted_value": prediction, 177 | }, 178 | ) 179 | 180 | return response 181 | 182 | def extend_forecast( 183 | self, 184 | forecast: Forecast, 185 | openai_api_key, 186 | et_api_key, 187 | google_api_key, 188 | google_search_id, 189 | google_search_query, 190 | facts=None, 191 | prediction_agent="Test Agent", 192 | ): 193 | 194 | fact_llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 195 | fact_chatbot = ChatBot(fact_llm) 196 | 197 | if 
isinstance(google_search_query, str): 198 | content = self.factbase.summarize_new_info( 199 | forecast.statement, 200 | fact_chatbot, 201 | google_api_key, 202 | google_search_id, 203 | google_search_query, 204 | ) 205 | elif isinstance(google_search_query, list): 206 | content = self.factbase.summarize_new_info_multiple_queries( 207 | forecast.statement, 208 | fact_chatbot, 209 | google_api_key, 210 | google_search_id, 211 | google_search_query, 212 | ) 213 | else: 214 | raise ValueError( 215 | "google_search_query must be a string or a list of strings" 216 | ) 217 | 218 | if content is None: 219 | print("No new content added to the forecast.") 220 | return None 221 | 222 | chatbot_messages = [ 223 | {"role": "system", "content": start_system_prompt}, 224 | {"role": "user", "content": extend_user_prompt}, 225 | ] 226 | 227 | chatbot = self.chatbot 228 | 229 | prompt_template = ChatPrompt(chatbot_messages) 230 | 231 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 232 | 233 | additional_facts = "" 234 | if facts is not None: 235 | additional_facts = "Some additional facts for consideration are below...\n" 236 | afctr = 1 237 | for f in facts: 238 | additional_facts += f"AF{afctr}: {f}\n" 239 | afctr += 1 240 | additional_facts += "---------------------\n\n" 241 | 242 | chatbot.messages = prompt_template.fill( 243 | statement_title=forecast.statement.title, 244 | statement_description=forecast.statement.description, 245 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 246 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 247 | content=content, 248 | the_date=the_date, 249 | additional_facts=additional_facts, 250 | earlier_forecast_value=str(forecast.value), 251 | earlier_forecast=forecast.justification, 252 | ) 253 | 254 | assistant_analysis = chatbot.resend() 255 | 256 | print("\n\n\n") 257 | print(assistant_analysis) 258 | 259 | uh = UtilityHelper(openai_api_key) 260 | prediction = uh.extract_prediction( 261 | assistant_analysis, forecast.statement.fill_in_the_blank 262 | ) 263 | 264 | client = Client(et_api_key) 265 | 266 | full_content = content + "\n\n-----------------\n\n" + assistant_analysis 267 | 268 | response = client.create_forecast( 269 | forecast.statement.id, 270 | "Prediction", 271 | full_content, 272 | prediction, 273 | prediction_agent, 274 | { 275 | "full_response_from_llm_before_source_cleanup": content, 276 | "full_response_from_llm": assistant_analysis, 277 | "extracted_value": prediction, 278 | }, 279 | forecast.id, 280 | ) 281 | 282 | return response 283 | -------------------------------------------------------------------------------- /emergingtrajectories/factsrag.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an experimental approach to tracking information regardless of source type. It will also power more than recent updates. Here's how it works... 3 | 4 | 1. All "Content Sources" (a new class type that obtains content) will send content directly to the Facts DB. 5 | 2. The "Facts DB" will then extract all relevant facts for a prediction or research theme. It will keep cache the original content, will track the sources, and will also input all the facts into a RAG database. 6 | 3. We can then query the DB for relevant facts on an ad hoc basis, rather than only for new content. 7 | 8 | """ 9 | 10 | import os 11 | import json 12 | import hashlib 13 | import re 14 | 15 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 
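# Illustrative end-to-end sketch of the flow described in the module docstring (keys, folder
# names, and queries are placeholders; FactRAGFileCache and FactBot are defined later in this
# module):
#
#   factdb = FactRAGFileCache("fact_db", openai_api_key)
#   factdb.new_get_new_info_google(google_api_key, google_search_id,
#                                  ["oil demand forecast"], topic="global oil demand")
#   bot = FactBot(factdb, openai_api_key)
#   print(bot.ask("What is happening to global oil demand?"))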
16 | from django.core.serializers.json import DjangoJSONEncoder 17 | 18 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 19 | from phasellm.agents import WebpageAgent, WebSearchAgent 20 | 21 | from datetime import datetime, timedelta 22 | 23 | from . import Client 24 | from .crawlers import crawlerPlaywright 25 | from .prompts import * 26 | from .news import NewsAPIAgent, RSSAgent, FinancialTimesAgent 27 | 28 | import chromadb 29 | import chromadb.utils.embedding_functions as embedding_functions 30 | 31 | # Number of search results to return from web searche (default value). 32 | _DEFAULT_NUM_SEARCH_RESULTS = 10 33 | 34 | facts_base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 35 | 36 | The project description is as follows... 37 | {statement_description} 38 | 39 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking. Rather than using bullet points, please list each as F1, F2, F3, etc... So that we can reference it. 40 | 41 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 42 | 43 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the number. 44 | 45 | Thus, a bullet point would look like this: 46 | F1: (information) [1] 47 | F2: (information) [1] 48 | F3: (information) [2] 49 | 50 | ... and so on, where F1, F2, F3, etc. are facts, and [1], [2] are the source documents you are extracting the facts from. 51 | """ 52 | 53 | facts_base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect. 54 | 55 | ---------------------- 56 | {scraped_content} 57 | ---------------------- 58 | 59 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) share any insights you might have based on the facts. 60 | 61 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 62 | 63 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending to hotter temperatures [3]." Do not include the "#" in the brackets, just the actual number. 64 | 65 | DO NOT PROVIDE A FORECAST, BUT SIMPLY STATE AND SHARE THE FACTS AND INSIGHTS YOU HAVE GATHERED. 66 | """ 67 | 68 | fact_system_prompt = """You are a researcher helping extract facts about {topic}, trends, and related observations. We will give you a piece of content scraped on the web. Please extract facts from this. Each fact should stand on its own, and can be several sentences long if need be. You can have as many facts as needed. For each fact, please start it as a new line with "---" as the bullet point. For example: 69 | 70 | --- Fact 1... This is the fact. 71 | --- Here is a second fact. 72 | --- And a third fact. 
73 | 74 | Please do not include new lines between bullet points. Make sure you write your facts in ENGLISH. Translate any foreign language content/facts/observations into ENGLISH. 75 | 76 | We will simply provide you with content and you will just provide facts.""" 77 | 78 | 79 | def uri_to_local(uri: str) -> str: 80 | """ 81 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 82 | 83 | Args: 84 | uri (str): The URI to convert. 85 | 86 | Returns: 87 | str: The MD5 sum of the URI. 88 | """ 89 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 90 | return uri_md5 91 | 92 | 93 | # TODO If this works, it should be an agent with setllm() supported, etc. 94 | # TODO Right now, we don't actually save sources. It's an important feature (track reliability, etc. too!) but we want to ensure the POC works well first. 95 | class FactRAGFileCache: 96 | 97 | def __init__( 98 | self, 99 | folder_path: str, 100 | openai_api_key: str, 101 | cache_file: str = "cache.json", 102 | sources_file: str = "sources.json", 103 | facts_file: str = "facts.json", 104 | rag_db_folder="cdb", 105 | crawler=None, 106 | ) -> None: 107 | """ 108 | This is a RAG-based fact database. We build a database of facts available in JSON and via RAG and use this as a basic search engine for information. We use ChromaDB to index all facts, but also maintain a list of facts, sources, etc. in a JSON file. Finally, we keep a cache of all content and assume URLs do not get updated; we'll change this process in the future. 109 | 110 | Args: 111 | folder_path (str): The folder where everything will be stored. 112 | openai_api_key (str): The OpenAI API key. Used for RAG embeddings. 113 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json". 114 | sources_file (str, optional): The name of the sources file. Defaults to "sources.json". 115 | facts_file (str, optional): The name of the facts file. Defaults to "facts.json". 116 | rag_db_folder (str, optional): The folder where the ChromaDB database will be stored. Defaults to "cdb". 117 | crawler (optional): The crawler to use. Defaults to None, in which case a Playwright crawler will be used. 118 | """ 119 | self.root_path = folder_path 120 | self.root_parsed = os.path.join(folder_path, "parsed") 121 | self.root_original = os.path.join(folder_path, "original") 122 | self.cache_file = os.path.join(folder_path, cache_file) 123 | self.sources_file = os.path.join(folder_path, sources_file) 124 | self.facts_file = os.path.join(folder_path, facts_file) 125 | self.rag_db_folder = os.path.join(folder_path, rag_db_folder) 126 | self.openai_api_key = openai_api_key 127 | 128 | # Use the same default crawler for all other agents. 129 | # TODO Eventually we'll want to have an array agents we use to get content. 130 | if crawler is None: 131 | self.crawler = crawlerPlaywright() 132 | else: 133 | self.crawler = crawler 134 | 135 | # Set up / load Chroma DB 136 | openai_ef = embedding_functions.OpenAIEmbeddingFunction( 137 | api_key=self.openai_api_key, model_name="text-embedding-3-small" 138 | ) 139 | self.chromadb_client = chromadb.PersistentClient(path=self.rag_db_folder) 140 | self.facts_rag_collection = self.chromadb_client.get_or_create_collection( 141 | name="facts", embedding_function=openai_ef 142 | ) 143 | 144 | # Set up / load cache 145 | self.cache = self.load_cache() 146 | 147 | # Set up / load facts dictionary 148 | # TODO Eventually, move this to a database or table or something. 
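# Note (added for clarity): each fact is stored twice -- as a document in the Chroma "facts"
# collection (ids "f1", "f2", ..., with an "added_on_timestamp" metadata field used for
# date-filtered queries) and as an entry in self.facts (content, source URL, date added)
# persisted to facts.json. See add_fact() below.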
149 | self.facts = self.load_facts() 150 | 151 | # Set up / load sources dictionary 152 | # TODO Eventually, move this to a database or table or something. 153 | self.sources = self.load_sources() 154 | 155 | def query_to_fact_list( 156 | self, query: str, n_results: int = 10, since_date: datetime = None 157 | ) -> dict: 158 | """ 159 | Takes a query and finds the closest semantic matches to the query in the knowledge base. 160 | 161 | Args: 162 | query (str): The query to search for. 163 | n_results (int, optional): The number of results to return. Defaults to 10. 164 | since_date (datetime, optional): The date to search from. Defaults to None, in which case all dates are searched. 165 | 166 | Returns: 167 | dict: A list of the facts found, with the key being the fact ID and each fact having its source, add date, and content info. 168 | """ 169 | 170 | r = [] 171 | if since_date is None: 172 | r = self.facts_rag_collection.query( 173 | query_texts=[query], n_results=n_results 174 | ) 175 | else: 176 | r = self.facts_rag_collection.query( 177 | query_texts=[query], 178 | n_results=n_results, 179 | where={"added_on_timestamp": {"$gt": since_date.timestamp()}}, 180 | ) 181 | 182 | facts = {} 183 | for item in r["ids"][0]: 184 | facts[item] = { 185 | "content": self.facts[item]["content"], 186 | "source": self.facts[item]["source"], 187 | "added": self.facts[item]["added"], 188 | } 189 | 190 | return facts 191 | 192 | def query_to_fact_content( 193 | self, query: str, n_results: int = 10, since_date=None, skip_separator=False 194 | ) -> str: 195 | """ 196 | Takes a query and finds the closest semantic matches to the query in the knowledge base. 197 | 198 | Args: 199 | query (str): The query to search for. 200 | n_results (int, optional): The number of results to return. Defaults to 10. 201 | since_date ([type], optional): The date to search from. Defaults to None, in which case all dates are searched. 202 | skip_separator (bool, optional): Whether to prepend and append a note horizontal line and title to the string being returned. Defaults to False. 203 | 204 | Returns: 205 | str: The content of the facts found, along with the fact IDs. 206 | 207 | """ 208 | 209 | facts = self.query_to_fact_list(query, n_results, since_date) 210 | 211 | if len(facts) == 0: 212 | return "" 213 | 214 | fact_content = "" 215 | if not skip_separator: 216 | fact_content = """--- START FACTS ---------------------------\n""" 217 | 218 | for key, fact in facts.items(): 219 | fact_content += key + ": " + fact["content"] + "\n" 220 | 221 | if not skip_separator: 222 | fact_content += """--- END FACTS ---------------------------\n""" 223 | 224 | return fact_content 225 | 226 | def get_all_recent_facts(self, days: float = 1, skip_separator=False) -> str: 227 | """ 228 | Returns a list of all facts and sources added in the last n days. 229 | 230 | Args: 231 | days (float, optional): The number of days to search back. Defaults to 1. Can be fractional as well. 232 | skip_separator (bool, optional): Whether to prepend and append a note horizontal line and title to the string being returned. Defaults to False. 233 | 234 | Returns: 235 | str: The content of the facts found, along with the fact IDs. 
236 | """ 237 | 238 | fact_content = "" 239 | 240 | if not skip_separator: 241 | fact_content = """--- START FACTS ---------------------------\n""" 242 | 243 | min_date_timestamp = (datetime.now() - timedelta(days=days)).timestamp() 244 | for key, fact in self.facts.items(): 245 | if fact["added_timestamp"] > min_date_timestamp: 246 | fact_content += key + ": " + fact["content"] + "\n" 247 | 248 | if not skip_separator: 249 | fact_content += """--- END FACTS ---------------------------\n""" 250 | 251 | return fact_content 252 | 253 | def save_facts_and_sources(self) -> None: 254 | """ 255 | Saves facts and sources to their respective files. 256 | """ 257 | with open(self.facts_file, "w") as f: 258 | json.dump(self.facts, f, indent=4, cls=DjangoJSONEncoder) 259 | with open(self.sources_file, "w") as f: 260 | json.dump(self.sources, f, indent=4, cls=DjangoJSONEncoder) 261 | 262 | def add_fact(self, fact: str, url: str) -> bool: 263 | """ 264 | Adds a fact to the knowledge base. 265 | 266 | Args: 267 | fact (str): The fact to add. 268 | url (str): The URL source of the fact. 269 | 270 | Returns: 271 | bool: True if the fact was added, False otherwise. 272 | """ 273 | 274 | fact_id_start = self.facts_rag_collection.count() + 1 275 | 276 | added_now = datetime.now() 277 | added_now_timestamp = added_now.timestamp() 278 | 279 | self.facts_rag_collection.add( 280 | documents=[fact], 281 | ids=[f"f{fact_id_start}"], 282 | metadatas=[{"added_on_timestamp": added_now_timestamp}], 283 | ) 284 | 285 | self.facts[f"f{fact_id_start}"] = { 286 | "added": added_now, 287 | "added_timestamp": added_now_timestamp, 288 | "source": url, 289 | "content": fact, 290 | "cid": f"f{fact_id_start}", 291 | } 292 | 293 | self.save_facts_and_sources() 294 | 295 | return True 296 | 297 | def facts_from_url(self, url: str, topic: str) -> None: 298 | """ 299 | Given a URL, extract facts from it and save them to ChromaDB and the facts dictionary. Also returns the facts in an array, in case one wants to analyze new facts. 300 | 301 | Args: 302 | url (str): Location of the content. 303 | topic (str): a brief description of the research you are undertaking. 304 | """ 305 | 306 | content = self.get(url) 307 | 308 | llm = OpenAIGPTWrapper(self.openai_api_key, model="gpt-4-turbo-preview") 309 | chatbot = ChatBot(llm) 310 | chatbot.messages = [{"role": "system", "content": fact_system_prompt}] 311 | 312 | prompt_template = ChatPrompt( 313 | [ 314 | {"role": "system", "content": fact_system_prompt}, 315 | ] 316 | ) 317 | 318 | chatbot.messages = prompt_template.fill(topic=topic) 319 | 320 | response = chatbot.chat(content) 321 | 322 | lines = response.split("\n") 323 | 324 | for line in lines: 325 | if line[0:4] == "--- ": 326 | fact = line[4:] 327 | self.add_fact(fact, url) 328 | 329 | self.save_facts_and_sources() 330 | 331 | # This builds facts based on RSS feeds. 332 | def new_get_rss_links(self, rss_url, topic) -> None: 333 | """ 334 | Crawls an RSS feed and its posts. 335 | 336 | Args: 337 | rss_url (str): The URL of the RSS feed. 338 | topic (str): a brief description of the research you are undertaking. 339 | """ 340 | 341 | rss_agent = RSSAgent(rss_url, crawler=self.crawler) 342 | urls = rss_agent.get_news_as_list() 343 | 344 | for url in urls: 345 | if not self.in_cache(url): 346 | print("RSS RESULT: " + url) 347 | try: 348 | self.facts_from_url(url, topic) 349 | except: 350 | print("Error; failed to get content from " + url) 351 | 352 | # This builds facts based on news articles. 
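# Like new_get_rss_links() above, the News API helper below feeds every uncached article URL
# through facts_from_url(), which asks the LLM for "--- " bullet facts and stores each one via
# add_fact() with the article URL as its source. Illustrative call (hypothetical key and query):
#
#   factdb.new_get_new_info_news(newsapi_api_key, topic="global oil demand",
#                                queries=["oil demand"], top_headlines=False)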
353 | def new_get_new_info_news( 354 | self, 355 | newsapi_api_key, 356 | topic, 357 | queries, 358 | top_headlines=False, 359 | ) -> None: 360 | """ 361 | Uses the News API to find new information and extract facts from it. 362 | 363 | Args: 364 | newsapi_api_key (str): The News API key. 365 | topic (str): a brief description of the research you are undertaking. 366 | queries (list[str]): A list of queries to search for. 367 | top_headlines (bool, optional): Whether to search for top headlines. Defaults to False. 368 | """ 369 | 370 | news_agent = NewsAPIAgent( 371 | newsapi_api_key, top_headlines=top_headlines, crawler=self.crawler 372 | ) 373 | 374 | for q in queries: 375 | results = news_agent.get_news_as_list(q) 376 | for result in results["articles"]: 377 | url = result["url"] 378 | if not self.in_cache(url): 379 | print("NEWS RESULT: " + url) 380 | self.facts_from_url(url, topic) 381 | 382 | # POC for FT 383 | def get_ft_news(self, ft_user, ft_pass, topic) -> None: 384 | """ 385 | Uses the Financial Times Agent to find new information and extract facts from it. 386 | 387 | Args: 388 | ft_user (str): The Financial Times username. 389 | ft_pass (str): The Financial Times password. 390 | topic (str): a brief description of the research you are undertaking. 391 | """ 392 | 393 | fta = FinancialTimesAgent(ft_user, ft_pass) 394 | urls, html_content, text_content = fta.get_news() 395 | 396 | if len(urls) != len(text_content): 397 | raise ValueError("URLs and text content are not the same length.") 398 | 399 | for i in range(0, len(urls)): 400 | url = urls[i] 401 | content = text_content[i] 402 | 403 | if not self.in_cache(url): 404 | print("FT RESULT: " + url) 405 | self.force_content(url, content) 406 | self.facts_from_url(url, topic) 407 | 408 | # This builds facts based on all the google searches. 409 | def new_get_new_info_google( 410 | self, 411 | google_api_key, 412 | google_search_id, 413 | google_search_queries, 414 | topic, 415 | ) -> None: 416 | """ 417 | Uses Google search to find new information and extract facts from it. 418 | 419 | Args: 420 | google_api_key (str): The Google API key. 421 | google_search_id (str): The Google search ID. 422 | google_search_queries (list[str]): A list of queries to search for. 423 | topic (str): a brief description of the research you are undertaking. 424 | """ 425 | 426 | self.google_api_key = google_api_key 427 | self.google_search_id = google_search_id 428 | self.google_search_queries = google_search_queries 429 | 430 | webagent = WebSearchAgent(api_key=self.google_api_key) 431 | 432 | for google_search_query in self.google_search_queries: 433 | 434 | results = webagent.search_google( 435 | query=google_search_query, 436 | custom_search_engine_id=self.google_search_id, 437 | num=_DEFAULT_NUM_SEARCH_RESULTS, 438 | ) 439 | 440 | for result in results: 441 | if not self.in_cache(result.url): 442 | try: 443 | print("SEARCH RESULT: " + result.url) 444 | # page_content = self.get(result.url) 445 | self.facts_from_url(result.url, topic) 446 | # print(page_content) 447 | except Exception as e: 448 | print(f"Failed to get content from {result.url}\n{e}") 449 | 450 | def save_state(self) -> None: 451 | """ 452 | Saves the in-memory changes to the knowledge base to the JSON cache file. 453 | """ 454 | with open(self.cache_file, "w") as f: 455 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 456 | 457 | def load_facts(self) -> dict: 458 | """ 459 | Loads the facts from the facts file. 
460 | """ 461 | if not os.path.exists(self.facts_file): 462 | with open(self.facts_file, "w") as f: 463 | f.write("{}") 464 | 465 | with open(self.facts_file, "r") as f: 466 | self.facts = json.load(f) 467 | 468 | return self.facts 469 | 470 | def load_sources(self) -> dict: 471 | """ 472 | Loads the sources from the sources file. 473 | """ 474 | if not os.path.exists(self.sources_file): 475 | with open(self.sources_file, "w") as f: 476 | f.write("{}") 477 | 478 | with open(self.sources_file, "r") as f: 479 | self.sources = json.load(f) 480 | 481 | return self.sources 482 | 483 | def load_cache(self) -> None: 484 | """ 485 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 486 | """ 487 | 488 | if not os.path.exists(self.root_path): 489 | os.makedirs(self.root_path) 490 | 491 | if not os.path.exists(self.root_parsed): 492 | os.makedirs(self.root_parsed) 493 | 494 | if not os.path.exists(self.root_original): 495 | os.makedirs(self.root_original) 496 | 497 | if not os.path.exists(self.cache_file): 498 | with open(self.cache_file, "w") as f: 499 | f.write("{}") 500 | 501 | with open(self.cache_file, "r") as f: 502 | return json.load(f) 503 | 504 | def in_cache(self, uri: str) -> bool: 505 | """ 506 | Checks if a URI is in the cache already. 507 | 508 | Args: 509 | uri (str): The URI to check. 510 | 511 | Returns: 512 | bool: True if the URI is in the cache, False otherwise. 513 | """ 514 | if uri in self.cache: 515 | return True 516 | return False 517 | 518 | def update_cache( 519 | self, uri: str, obtained_on: datetime, last_accessed: datetime 520 | ) -> None: 521 | """ 522 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 523 | 524 | Args: 525 | uri (str): The URI to update. 526 | obtained_on (datetime): The date and time when the content was obtained. 527 | last_accessed (datetime): The date and time when the content was last accessed. 528 | """ 529 | uri_md5 = uri_to_local(uri) 530 | self.cache[uri] = { 531 | "obtained_on": obtained_on, 532 | "last_accessed": last_accessed, 533 | "accessed": 0, 534 | "uri_md5": uri_md5, 535 | } 536 | self.save_state() 537 | 538 | def log_access(self, uri: str) -> None: 539 | """ 540 | Saves the last accessed time and updates the accessed tracker for a given URI. 541 | 542 | Args: 543 | uri (str): The URI to update. 544 | """ 545 | self.cache[uri]["last_accessed"] = datetime.now() 546 | self.cache[uri]["accessed"] = 1 547 | self.save_state() 548 | 549 | def get_unaccessed_content(self) -> list[str]: 550 | """ 551 | Returns a list of URIs that have not been accessed by the agent. 552 | 553 | Returns: 554 | list[str]: A list of URIs that have not been accessed by the agent. 555 | """ 556 | unaccessed = [] 557 | for uri in self.cache: 558 | if self.cache[uri]["accessed"] == 0: 559 | unaccessed.append(uri) 560 | return unaccessed 561 | 562 | def force_content(self, uri: str, content: str, check_exists: bool = True) -> bool: 563 | """ 564 | Forces a specific URI to have specific content (both HTML and text content). Used to fill old links that we don't actually want to crawl. 565 | 566 | Args: 567 | uri (str): The URI to force content for. 568 | content (str): The content to force. 569 | check_exists (bool): checks if content has already been included in the cache before forcing the new content. 570 | 571 | Returns: 572 | bool: True if the content was forced, False otherwise. 
573 | """ 574 | 575 | # If the content already exists and we avoid overwrites, then we don't want to overwrite it. 576 | if check_exists and self.in_cache(uri): 577 | return False 578 | 579 | uri_md5 = uri_to_local(uri) 580 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 581 | f.write(content) 582 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 583 | f.write(content) 584 | 585 | self.update_cache(uri, datetime.now(), datetime.now()) 586 | self.log_access(uri) 587 | 588 | return True 589 | 590 | def get(self, uri: str) -> str: 591 | """ 592 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 593 | 594 | Args: 595 | uri (str): The URI to get the content for. 596 | 597 | Returns: 598 | str: The content for the given URI. 599 | """ 600 | uri_md5 = uri_to_local(uri) 601 | if uri in self.cache: 602 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 603 | return f.read() 604 | else: 605 | # scraper = WebpageAgent() 606 | 607 | # content_raw = scraper.scrape(uri, text_only=False, body_only=False) 608 | # with open(os.path.join(self.root_original, uri_md5), "w") as f: 609 | # f.write(content_raw) 610 | 611 | # content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 612 | # with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 613 | # f.write(content_parsed) 614 | 615 | try: 616 | content, text = self.crawler.get_content(uri) 617 | except Exception as e: 618 | print(f"Failed to get content from {uri}\n{e}") 619 | content = "" 620 | text = "" 621 | 622 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 623 | f.write(content) 624 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 625 | f.write(text) 626 | 627 | self.update_cache(uri, datetime.now(), datetime.now()) 628 | 629 | return text 630 | 631 | def add_content(self, content: str, uri: str = None) -> None: 632 | """ 633 | Adds content to cache. 634 | 635 | Args: 636 | content (str): The content to add to the cache. 637 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 638 | """ 639 | if uri is None: 640 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 641 | uri_md5 = uri_to_local(uri) 642 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 643 | f.write(content) 644 | self.update_cache(uri, datetime.now(), datetime.now()) 645 | 646 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 647 | """ 648 | Adds content from a text file to the cache. 649 | 650 | Args: 651 | filepath (str): The path to the file to add to the cache. 652 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 653 | """ 654 | with open(filepath, "r") as f: 655 | content = f.read() 656 | self.add_content(content, uri) 657 | 658 | 659 | class FactBot: 660 | 661 | def __init__( 662 | self, 663 | knowledge_db: FactRAGFileCache, 664 | openai_api_key: str = None, 665 | chatbot: ChatBot = None, 666 | ) -> None: 667 | """ 668 | The FactBot is like a ChatBot but enables you to ask questions that reference an underlying RAG database (KnowledgeBaseFileCache), which then enables the chatbot to cite sourcable facts. 669 | 670 | Args: 671 | knowledge_db (FactRAGFileCache): The knowledge database to use. 672 | openai_api_key (str, optional): The OpenAI API key. Defaults to None. 673 | chatbot (ChatBot, optional): The PhaseLLB chatbot to use. 
Defaults to None, in which case an OpenAI chatbot is used (and the OpenAI API key must be provided). 674 | """ 675 | if openai_api_key is None and chatbot is None: 676 | raise ValueError("One of openai_api_key or chatbot must be provided.") 677 | 678 | if chatbot is not None: 679 | self.chatbot = chatbot 680 | else: 681 | llm = OpenAIGPTWrapper(openai_api_key, model="gpt-4-turbo-preview") 682 | self.chatbot = ChatBot(llm) 683 | self.chatbot.messages = [ 684 | {"role": "system", "content": system_prompt_question_continuous} 685 | ] 686 | 687 | self.knowledge_db = knowledge_db 688 | 689 | def ask(self, question: str, clean_sources: bool = True) -> str: 690 | """ 691 | Ask a question to the FactBot. This will query the underlying knowledge database and use the returned facts to answer the question. 692 | 693 | Args: 694 | question (str): The question to ask. 695 | clean_sources (bool, optional): Whether to clean the sources in the response. Defaults to True; in this case, it will replace fact IDs with relevant source links at the end of the response. 696 | 697 | Returns: 698 | str: The response to the question. 699 | """ 700 | message = self.knowledge_db.query_to_fact_content(question) + "\n\n" + question 701 | response = self.chatbot.chat(message) 702 | if clean_sources: 703 | return clean_fact_citations(self.knowledge_db, response) 704 | else: 705 | return response 706 | 707 | def source(self, fact_id: str) -> str: 708 | """ 709 | Returns the URL source for a given fact ID. 710 | 711 | Args: 712 | fact_id (str): The fact ID to get the source for. 713 | 714 | Returns: 715 | str: The URL source for the given fact ID. 716 | """ 717 | if fact_id in self.knowledge_db.facts: 718 | return self.knowledge_db.facts[fact_id]["source"] 719 | 720 | if fact_id.lower() in self.knowledge_db.facts: 721 | return self.knowledge_db.facts[fact_id.lower()]["source"] 722 | 723 | raise ValueError( 724 | f"Fact ID " + str(fact_id) + " not found in the knowledge database." 725 | ) 726 | 727 | def clean_and_source_to_html( 728 | self, text_to_clean: str, start_count: int = 0 729 | ) -> list: 730 | """ 731 | Returns a formatted response with sourced HTML. This is used for emergingtrajectories.com and acts as a base for anyone else wanting to build similar features. 732 | 733 | Args: 734 | text_to_clean: The text to clean/cite/source. 735 | start_count: The starting count for the sources. 736 | 737 | Returns: 738 | list: two strings -- the actual response in the first case, and the sources in the second case, and an integer representing the new source count. 
739 | """ 740 | 741 | pattern = r"\[f[\d\s\,f]+\]" 742 | new_text = "" 743 | sources_text = "" 744 | ref_ctr = start_count 745 | last_index = 0 746 | 747 | for match in re.finditer(pattern, text_to_clean, flags=re.IGNORECASE): 748 | 749 | if match.group(0).find(",") == -1: 750 | ref_ctr += 1 751 | ref = match.group(0)[1:-1].strip() 752 | ref = ref.lower() 753 | 754 | new_text += text_to_clean[last_index : match.start()] 755 | new_text += f"""{ref_ctr}""" 756 | 757 | # Save the source 758 | fact_text = self.knowledge_db.facts[ref]["content"] 759 | new_source_text = f"""{ref_ctr}: {fact_text} View Source""" 760 | sources_text += new_source_text + "\n" 761 | 762 | last_index = match.end() 763 | else: 764 | refs = match.group(0)[1:-1].split(",") 765 | ref_arr = [] 766 | ref_str = "" 767 | for ref in refs: 768 | ref = ref.strip() 769 | ref = ref.lower() 770 | ref_ctr += 1 771 | ref_arr.append(str(ref_ctr)) 772 | 773 | # Add the source to the text 774 | new_text_source_num = f"""{ref_ctr}""" 775 | ref_str += " " + new_text_source_num 776 | 777 | # Save the source 778 | fact_text = self.knowledge_db.facts[ref]["content"] 779 | new_source_text = f"""{ref_ctr}: ${fact_text} View Source""" 780 | sources_text += new_source_text + "\n" 781 | 782 | new_text += text_to_clean[last_index : match.start()] + ref_str 783 | last_index = match.end() 784 | 785 | new_text += text_to_clean[last_index:] 786 | 787 | return new_text, sources_text, ref_ctr 788 | 789 | 790 | def clean_fact_citations(knowledge_db: FactRAGFileCache, text_to_clean: str) -> str: 791 | """ 792 | Converts fact IDs referenced in a piece of text to relevant source links, appending sources as end notes in the document/text. 793 | 794 | Args: 795 | knowledge_db (FactRAGFileCache): The knowledge database to use for fact lookups. 796 | text_to_clean (str): The text to clean. 797 | 798 | Returns: 799 | str: The cleaned text. 800 | """ 801 | bot = FactBot(knowledge_db, knowledge_db.openai_api_key) 802 | pattern = r"\[f[\d\s\,f]+\]" 803 | new_text = "" 804 | ref_ctr = 0 805 | last_index = 0 806 | sources_list = "" 807 | for match in re.finditer(pattern, text_to_clean): 808 | if match.group(0).find(",") == -1: 809 | ref_ctr += 1 810 | ref = match.group(0)[1:-1].strip() 811 | new_text += text_to_clean[last_index : match.start()] 812 | new_text += f"[{ref_ctr}]" 813 | sources_list += f"{ref_ctr} :: " + bot.source(f"{ref}") + "\n" 814 | last_index = match.end() 815 | else: 816 | refs = match.group(0)[1:-1].split(",") 817 | ref_arr = [] 818 | for ref in refs: 819 | ref = ref.strip() 820 | ref_ctr += 1 821 | ref_arr.append(str(ref_ctr)) 822 | sources_list += f"{ref_ctr} :: " + bot.source(f"{ref}") + "\n" 823 | ref_str = "[" + ", ".join(ref_arr) + "]" 824 | new_text += text_to_clean[last_index : match.start()] + ref_str 825 | last_index = match.end() 826 | 827 | new_text += text_to_clean[last_index:] 828 | 829 | if ref_ctr == 0: 830 | return text_to_clean 831 | else: 832 | return new_text + "\n\nSources:\n" + sources_list 833 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . 
import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be based on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 13 | 14 | F1: The President of the USA is Joe Biden. 15 | F2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answer should be something like this: 18 | 19 | The President of the USA is Joe Biden [F1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversation chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity.
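# Usage sketch for FactsRAGForecastingAgent (illustrative only -- the API keys,
# statement ID, and model name below are hypothetical placeholders, and the
# FactRAGFileCache is assumed to have been built and populated elsewhere):
#
#   llm = OpenAIGPTWrapper("sk-...", model="gpt-4-turbo-preview")
#   chatbot = ChatBot(llm)
#   et_client = ETClient("et-api-key")
#   statement = et_client.get_statement(123)
#   factbase = ...  # an existing, already-populated FactRAGFileCache instance
#   agent = FactsRAGForecastingAgent(et_client, chatbot, factbase)
#   agent.create_forecast(statement, "sk-...", "et-api-key", prediction_agent="Example Agent")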
80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
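# The since_date filter below limits both queries to facts ingested after the prior
# forecast's created_at timestamp; if neither query returns anything new, we print a
# notice and return None without creating a new forecast on the platform.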
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster2.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag2 import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be absed on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 
13 | 14 | F1: The President of the USA is Joe Biden. 15 | F2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answers hould be something like this: 18 | 19 | The President of th USA is Joe Biden [F1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversatio chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 
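# Follow-up sketch: because the ChatBot is kept on the agent (see getChatBot above),
# the conversation can be continued after a forecast to ask for clarifications.
# Illustrative only -- the question below is a hypothetical example:
#
#   agent.create_forecast(statement, openai_api_key, et_api_key)
#   bot = agent.getChatBot()
#   clarification = bot.chat("Which facts most influenced the number you chose?")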
88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/factsragforecaster3.py: -------------------------------------------------------------------------------- 1 | from .recursiveagent import ETClient 2 | from .factsrag3 import FactRAGFileCache, FactBot, clean_fact_citations 3 | from .utils import UtilityHelper 4 | from . import Client, Statement, Forecast 5 | 6 | from phasellm.llms import ChatBot, OpenAIGPTWrapper, ChatPrompt 7 | 8 | from datetime import datetime 9 | 10 | start_system_prompt = """Today's date is {the_date}. You are a researcher helping with economics and politics research. We will give you a few facts and we need you to fill in a blank to the best of your knowledge, based on all the information provided to you. All your answers should be absed on these facts ONLY. 11 | 12 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 
13 | 14 | f1: The President of the USA is Joe Biden. 15 | f2: The Vice President of the USA is Kamala Harris. 16 | 17 | ... your answers hould be something like this: 18 | 19 | The President of th USA is Joe Biden [f1]. 20 | 21 | We will give you a list of facts for every question. You can reference those facts, or you can also reference earlier facts from the conversatio chain. YOU CANNOT USE OTHER INFORMATION.""" 22 | 23 | start_user_prompt = """Here is the research: 24 | {content} 25 | {additional_facts} 26 | ------------ 27 | 28 | Given the above, we need you to do your best to fill in the following blank... 29 | {fill_in_the_blank} 30 | 31 | PLEASE DO THE FOLLOWING: 32 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 33 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 34 | - Do not provide a range; provide ONE number. 35 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 36 | 37 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI. 38 | """ 39 | 40 | extend_user_prompt = """Here is the research: 41 | {content} 42 | {additional_facts} 43 | --------------------- 44 | 45 | In addition to the new content above, we want to UPDATE the forecast from before. Here is the earlier forecast... 46 | --------------------- 47 | FORECAST: {earlier_forecast_value} 48 | 49 | JUSTIFICATION: 50 | {earlier_forecast} 51 | --------------------- 52 | 53 | Given the above, we need you to do your best to fill in the following blank... 54 | {fill_in_the_blank} 55 | 56 | PLEASE DO THE FOLLOWING: 57 | - Provide any further justification ONLY BASED ON THE FACTS AND SOURCES PROVIDED ABOVE. 58 | - Explain your forecast and how the facts, insights, etc. support it. Do not simply state a number. 59 | - Do not provide a range; provide ONE number. 60 | - End your forecast with the filled-in statement: {fill_in_the_blank_2} 61 | 62 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.""" 63 | 64 | 65 | class FactsRAGForecastingAgent(object): 66 | 67 | # TODO: document / clean up 68 | def __init__( 69 | self, 70 | client: ETClient, 71 | chatbot: ChatBot, 72 | factbase: FactRAGFileCache, 73 | ): 74 | 75 | self.client = client 76 | self.chatbot = chatbot 77 | self.factbase = factbase 78 | 79 | # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity. 80 | def setChatBot(self, chatbot): 81 | self.chatbot = chatbot 82 | 83 | # TODO: standardize -- camel case or snake case? Or something else? 84 | def getChatBot(self): 85 | return self.chatbot 86 | 87 | # TODO: we can do much better at disaggregating all these functions. Currently just want this to work. 
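# Sketch of updating an existing forecast, assuming an `agent` and `et_client` set up
# as in the earlier usage sketch (illustrative only; the forecast ID is a hypothetical
# placeholder). extend_forecast() pulls only facts added to the fact base since the
# prior forecast's created_at, and links the new forecast to the old one via forecast.id:
#
#   prior = et_client.get_forecast(456)
#   agent.extend_forecast(prior, openai_api_key, et_api_key, prediction_agent="Example Agent")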
88 | def create_forecast( 89 | self, 90 | statement: Statement, 91 | openai_api_key, 92 | et_api_key, 93 | facts=None, 94 | prediction_agent="Test Agent", 95 | ): 96 | 97 | # factbot = FactBot(self.factbase, openai_api_key) 98 | query1 = self.factbase.query_to_fact_content( 99 | statement.fill_in_the_blank, n_results=25, skip_separator=True 100 | ) 101 | query2 = self.factbase.query_to_fact_content( 102 | statement.description, n_results=25, skip_separator=True 103 | ) 104 | 105 | if len(query1) == 0 and len(query2) == 0: 106 | print("No new content added to the forecast.") 107 | return None 108 | 109 | facts_to_use = ( 110 | """--- START FACTS ---------------------------\n""" 111 | + query1.strip() 112 | + "\n" 113 | + query2.strip() 114 | + """--- END FACTS ---------------------------\n""" 115 | ) 116 | 117 | chatbot_messages = [ 118 | {"role": "system", "content": start_system_prompt}, 119 | {"role": "user", "content": start_user_prompt}, 120 | ] 121 | 122 | chatbot = self.chatbot 123 | 124 | prompt_template = ChatPrompt(chatbot_messages) 125 | 126 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 127 | 128 | additional_facts = "" 129 | if facts is not None: 130 | additional_facts = "Some additional facts for consideration are below...\n" 131 | afctr = 1 132 | for f in facts: 133 | additional_facts += f"AF{afctr}: {f}\n" 134 | afctr += 1 135 | additional_facts += "---------------------\n\n" 136 | 137 | chatbot.messages = prompt_template.fill( 138 | statement_title=statement.title, 139 | statement_description=statement.description, 140 | statement_fill_in_the_blank=statement.fill_in_the_blank, 141 | fill_in_the_blank_2=statement.fill_in_the_blank, 142 | content=facts_to_use, 143 | the_date=the_date, 144 | additional_facts=additional_facts, 145 | ) 146 | 147 | assistant_analysis = chatbot.resend() 148 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 149 | 150 | print("\n\n\n") 151 | print(assistant_analysis) 152 | 153 | uh = UtilityHelper(openai_api_key) 154 | prediction = uh.extract_prediction( 155 | assistant_analysis, statement.fill_in_the_blank 156 | ) 157 | 158 | client = Client(et_api_key) 159 | 160 | # full_content = content + "\n\n-----------------\n\n" + assistant_analysis 161 | 162 | response = client.create_forecast( 163 | statement.id, 164 | "Prediction", 165 | full_content, 166 | prediction, 167 | prediction_agent, 168 | { 169 | # "full_response_from_llm_before_source_cleanup": content, 170 | "full_response_from_llm": assistant_analysis, 171 | "extracted_value": prediction, 172 | }, 173 | ) 174 | 175 | return response 176 | 177 | def extend_forecast( 178 | self, 179 | forecast: Forecast, 180 | openai_api_key, 181 | et_api_key, 182 | facts=None, 183 | prediction_agent="Test Agent", 184 | ): 185 | 186 | # Note: we only update the forecast with data/info we added since the last forecast. 
187 | 188 | query1 = self.factbase.query_to_fact_content( 189 | forecast.statement.fill_in_the_blank, 190 | n_results=25, 191 | skip_separator=True, 192 | since_date=forecast.created_at, 193 | ) 194 | query2 = self.factbase.query_to_fact_content( 195 | forecast.statement.description, 196 | n_results=25, 197 | skip_separator=True, 198 | since_date=forecast.created_at, 199 | ) 200 | 201 | if len(query1) == 0 and len(query2) == 0: 202 | print("No new content added to the forecast.") 203 | return None 204 | 205 | facts_to_use = ( 206 | """--- START FACTS ---------------------------\n""" 207 | + query1.strip() 208 | + "\n" 209 | + query2.strip() 210 | + """--- END FACTS ---------------------------\n""" 211 | ) 212 | 213 | chatbot_messages = [ 214 | {"role": "system", "content": start_system_prompt}, 215 | {"role": "user", "content": extend_user_prompt}, 216 | ] 217 | 218 | chatbot = self.chatbot 219 | 220 | prompt_template = ChatPrompt(chatbot_messages) 221 | 222 | the_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 223 | 224 | additional_facts = "" 225 | if facts is not None: 226 | additional_facts = "Some additional facts for consideration are below...\n" 227 | afctr = 1 228 | for f in facts: 229 | additional_facts += f"AF{afctr}: {f}\n" 230 | afctr += 1 231 | additional_facts += "---------------------\n\n" 232 | 233 | chatbot.messages = prompt_template.fill( 234 | statement_title=forecast.statement.title, 235 | statement_description=forecast.statement.description, 236 | statement_fill_in_the_blank=forecast.statement.fill_in_the_blank, 237 | fill_in_the_blank_2=forecast.statement.fill_in_the_blank, 238 | content=facts_to_use, 239 | the_date=the_date, 240 | additional_facts=additional_facts, 241 | earlier_forecast_value=str(forecast.value), 242 | earlier_forecast=forecast.justification, 243 | ) 244 | 245 | assistant_analysis = chatbot.resend() 246 | 247 | # print("\n\n\n") 248 | # print(assistant_analysis) 249 | 250 | full_content = clean_fact_citations(self.factbase, assistant_analysis) 251 | print(full_content) 252 | 253 | uh = UtilityHelper(openai_api_key) 254 | prediction = uh.extract_prediction( 255 | assistant_analysis, forecast.statement.fill_in_the_blank 256 | ) 257 | 258 | client = Client(et_api_key) 259 | 260 | response = client.create_forecast( 261 | forecast.statement.id, 262 | "Prediction", 263 | full_content, 264 | prediction, 265 | prediction_agent, 266 | { 267 | "full_response_from_llm": assistant_analysis, 268 | "extracted_value": prediction, 269 | }, 270 | forecast.id, 271 | ) 272 | 273 | return response 274 | -------------------------------------------------------------------------------- /emergingtrajectories/knowledge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Solutions for finding, extracting, storing, and reviisting knowledge. 3 | """ 4 | 5 | """ 6 | This is the first knowledge base, and is meant to be a POC, really. 7 | 8 | All of our agents as of today (Feb 1) focus on web searches and website content. Today, we do a Google search and scrape the content from the top results. We repeat this process every time the agent runs. 9 | 10 | An obvious next step would be to create some sort of a cache to see if we already scraped the page and included the content elsewhere. 11 | 12 | This should also be able to do multiple searches *and* accept other URLs to scrape. 13 | 14 | How would this one work? 15 | 1. Have a folder where things get cached. 16 | 2. 
Have a JSON file that tracks when a knowledge base was accessed, the source URL, etc. 17 | """ 18 | 19 | import os 20 | import json 21 | import hashlib 22 | 23 | # Using JSONEncoder to be consistent with the Emerging Trajectories website and platform. 24 | from django.core.serializers.json import DjangoJSONEncoder 25 | 26 | from phasellm.agents import WebpageAgent 27 | 28 | from datetime import datetime 29 | 30 | from . import Client 31 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 32 | 33 | """ 34 | CACHE STRUCTURE IN JSON... 35 | 36 | key: URI (URL or file name) 37 | value: { 38 | "obtained_on": ; when the file was downloaded 39 | "last_accessed": ; when the file was last used by the agent 40 | "accessed": 0 if not accessed, 1 if accessed 41 | "uri_md5": the MD5 sum of the URI 42 | } 43 | 44 | """ 45 | 46 | 47 | def statement_to_search_queries( 48 | statement_id: int, client: Client, openai_api_key: str, num_queries: int = 3 49 | ) -> list[str]: 50 | """ 51 | Given a specific statement ID, this will return a list of queries you can put into a search engine to get useful information. 52 | 53 | Args: 54 | statement_id (int): The ID of the statement to get search queries for. 55 | client (Client): The Emerging Trajectories API client. 56 | openai_api_key (str): The OpenAI API key. 57 | num_queries (int, optional): The number of queries to return. Defaults to 3. 58 | 59 | Returns: 60 | list[str]: A list of search queries. 61 | 62 | """ 63 | 64 | statement = client.get_statement(statement_id) 65 | # print(statement) 66 | 67 | llm = OpenAIGPTWrapper(openai_api_key, model="gpt-3.5-turbo") 68 | chatbot = ChatBot(llm) 69 | 70 | chatbot.messages = [ 71 | { 72 | "role": "system", 73 | "content": f"""I am working on a research project about this topic:\n{statement['title']}\n\n{statement['description']}\n\nHere is more information about what I am trying to do:\n{statement['description']}""", 74 | }, 75 | { 76 | "role": "user", 77 | "content": f"Could you please provide me with up to {num_queries} search queries that I can input into a search engine to find info about this topic? Please do not qualify your response... Simply provide one search query per line and nothing else.", 78 | }, 79 | ] 80 | 81 | response = chatbot.resend() 82 | lines = response.strip().split("\n") 83 | 84 | if len(lines) > num_queries: 85 | lines = lines[:num_queries] 86 | 87 | return lines 88 | 89 | 90 | def uri_to_local(uri: str) -> str: 91 | """ 92 | Convert a URI to a local file name. In this case, we typically will use an MD5 sum. 93 | 94 | Args: 95 | uri (str): The URI to convert. 96 | 97 | Returns: 98 | str: The MD5 sum of the URI. 99 | """ 100 | uri_md5 = hashlib.md5(uri.encode("utf-8")).hexdigest() 101 | return uri_md5 102 | 103 | 104 | class KnowledgeBaseFileCache: 105 | 106 | def __init__(self, folder_path: str, cache_file: str = "cache.json") -> None: 107 | """ 108 | The KnowledgeBaseFileCache is a simple file-based cache for web content and local files. The cache stores the original HTML, PDF, or TXT content and tracks when (if ever) an agent actually accessed the content. 109 | 110 | Args: 111 | folder_path (str): The folder where the cache will be stored. 112 | cache_file (str, optional): The name of the cache file. Defaults to "cache.json".
113 | """ 114 | self.root_path = folder_path 115 | self.root_parsed = os.path.join(folder_path, "parsed") 116 | self.root_original = os.path.join(folder_path, "original") 117 | self.cache_file = os.path.join(folder_path, cache_file) 118 | self.cache = self.load_cache() 119 | 120 | def save_state(self) -> None: 121 | """ 122 | Saves the in-memory changes to the knowledge base to the JSON cache file. 123 | """ 124 | with open(self.cache_file, "w") as f: 125 | json.dump(self.cache, f, cls=DjangoJSONEncoder) 126 | 127 | def load_cache(self) -> None: 128 | """ 129 | Loads the cache from the cache file, or creates the relevant files and folders if one does not exist. 130 | """ 131 | 132 | if not os.path.exists(self.root_path): 133 | os.makedirs(self.root_path) 134 | 135 | if not os.path.exists(self.root_parsed): 136 | os.makedirs(self.root_parsed) 137 | 138 | if not os.path.exists(self.root_original): 139 | os.makedirs(self.root_original) 140 | 141 | if not os.path.exists(self.cache_file): 142 | with open(self.cache_file, "w") as f: 143 | f.write("{}") 144 | 145 | with open(self.cache_file, "r") as f: 146 | return json.load(f) 147 | 148 | def in_cache(self, uri: str) -> bool: 149 | """ 150 | Checks if a URI is in the cache already. 151 | 152 | Args: 153 | uri (str): The URI to check. 154 | 155 | Returns: 156 | bool: True if the URI is in the cache, False otherwise. 157 | """ 158 | if uri in self.cache: 159 | return True 160 | return False 161 | 162 | def update_cache( 163 | self, uri: str, obtained_on: datetime, last_accessed: datetime 164 | ) -> None: 165 | """ 166 | Updates the cache file for a given URI, specifically when it was obtained and last accessed. 167 | 168 | Args: 169 | uri (str): The URI to update. 170 | obtained_on (datetime): The date and time when the content was obtained. 171 | last_accessed (datetime): The date and time when the content was last accessed. 172 | """ 173 | uri_md5 = uri_to_local(uri) 174 | self.cache[uri] = { 175 | "obtained_on": obtained_on, 176 | "last_accessed": last_accessed, 177 | "accessed": 0, 178 | "uri_md5": uri_md5, 179 | } 180 | self.save_state() 181 | 182 | def log_access(self, uri: str) -> None: 183 | """ 184 | Saves the last accessed time and updates the accessed tracker for a given URI. 185 | 186 | Args: 187 | uri (str): The URI to update. 188 | """ 189 | self.cache[uri]["last_accessed"] = datetime.now() 190 | self.cache[uri]["accessed"] = 1 191 | self.save_state() 192 | 193 | def get_unaccessed_content(self) -> list[str]: 194 | """ 195 | Returns a list of URIs that have not been accessed by the agent. 196 | 197 | Returns: 198 | list[str]: A list of URIs that have not been accessed by the agent. 199 | """ 200 | unaccessed = [] 201 | for uri in self.cache: 202 | if self.cache[uri]["accessed"] == 0: 203 | unaccessed.append(uri) 204 | return unaccessed 205 | 206 | def get(self, uri: str) -> str: 207 | """ 208 | Returns the content for a given URI. If the content is not in the cache, it will be scraped and added to the cache. 209 | 210 | Args: 211 | uri (str): The URI to get the content for. 212 | 213 | Returns: 214 | str: The content for the given URI. 
215 | """ 216 | uri_md5 = uri_to_local(uri) 217 | if uri in self.cache: 218 | with open(os.path.join(self.root_parsed, uri_md5), "r") as f: 219 | return f.read() 220 | else: 221 | scraper = WebpageAgent() 222 | 223 | content_raw = scraper.scrape(uri, text_only=False, body_only=False) 224 | with open(os.path.join(self.root_original, uri_md5), "w") as f: 225 | f.write(content_raw) 226 | 227 | content_parsed = scraper.scrape(uri, text_only=True, body_only=True) 228 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 229 | f.write(content_parsed) 230 | 231 | self.update_cache(uri, datetime.now(), datetime.now()) 232 | 233 | return content_parsed 234 | 235 | def add_content(self, content: str, uri: str = None) -> None: 236 | """ 237 | Adds content to cache. 238 | 239 | Args: 240 | content (str): The content to add to the cache. 241 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 242 | """ 243 | if uri is None: 244 | uri = hashlib.md5(content.encode("utf-8")).hexdigest() 245 | uri_md5 = uri_to_local(uri) 246 | with open(os.path.join(self.root_parsed, uri_md5), "w") as f: 247 | f.write(content) 248 | self.update_cache(uri, datetime.now(), datetime.now()) 249 | 250 | def add_content_from_file(self, filepath: str, uri: str = None) -> None: 251 | """ 252 | Adds content from a text file to the cache. 253 | 254 | Args: 255 | filepath (str): The path to the file to add to the cache. 256 | uri (str, optional): The URI to use for the content. Defaults to None, in which case an MD5 sum of the content will be used. 257 | """ 258 | with open(filepath, "r") as f: 259 | content = f.read() 260 | self.add_content(content, uri) 261 | -------------------------------------------------------------------------------- /emergingtrajectories/news.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import feedparser 3 | import time 4 | import random 5 | 6 | from .crawlers import crawlerPlaywright, _get_text_bs4 7 | 8 | from playwright.sync_api import sync_playwright 9 | 10 | from news_search_client import NewsSearchClient 11 | from azure.core.credentials import AzureKeyCredential 12 | 13 | 14 | def force_empty_content(rss_url: str, content, cache_function) -> None: 15 | """ 16 | Force the crawler to visit every URL in the RSS feed and save it as a blank content file. We do this because some RSS feeds have a lot of old URLs we do not need to crawl, and only want to crawl the delta over some period. 17 | 18 | Args: 19 | rss_url (str): The URL of the RSS feed. 20 | content: the content string to save. 21 | cache_function: the specific function to call the rss_url and content to save. 22 | """ 23 | 24 | agent = RSSAgent(rss_url) 25 | all_urls = agent.get_news_as_list() 26 | for u in all_urls: 27 | cache_function(u, content) 28 | 29 | 30 | class RSSAgent: 31 | 32 | def __init__(self, rss_url, crawler=None) -> None: 33 | """ 34 | A simple wrapper for an RSS feed, so we can query it for URLs. 35 | 36 | Args: 37 | rss_url (str): The URL of the RSS feed. 38 | crawler (Crawler, optional): The crawler to use. Defaults to None, in which case we will use crawlerPlaywright in headless mode. 39 | """ 40 | self.rss_url = rss_url 41 | if crawler is None: 42 | self.crawler = crawlerPlaywright() 43 | else: 44 | self.crawler = crawler 45 | 46 | def get_news_as_list(self) -> list: 47 | """ 48 | Query the RSS feed for news articles, and return them as a list of dictionaries. 
49 | 50 | Returns: 51 | list: A list of URLs. 52 | """ 53 | urls = [] 54 | feed = feedparser.parse(self.rss_url) 55 | for entry in feed.entries: 56 | urls.append(entry.link) 57 | return urls 58 | 59 | 60 | class NewsBingAgent: 61 | 62 | def __init__(self, api_key: str, endpoint: str): 63 | """ 64 | Creates a new Bing News API agent. To learn more, see: https://github.com/microsoft/bing-search-sdk-for-python/ 65 | 66 | Args: 67 | api_key (str): The Bing News API key. 68 | endpoint (str): The Bing News API endpoint. 69 | """ 70 | self.api_key = api_key 71 | self.endpoint = endpoint 72 | 73 | def get_news_as_list(self, query: str, market: str = "en-us") -> list: 74 | """ 75 | Gets a list of URLS from the Bing News API. For more information on markets, see: https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes 76 | 77 | Args: 78 | query (str): The query to search for. 79 | market (str, optional): The market to search in. Defaults to "en-us". (US English 80 | 81 | Returns: 82 | list: A list of URLs. 83 | """ 84 | 85 | client = NewsSearchClient( 86 | endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key) 87 | ) 88 | 89 | urls = [] 90 | 91 | try: 92 | news_result = client.news.search(query=query, market=market, count=10) 93 | for n in news_result.value: 94 | urls.append(n.url) 95 | 96 | except Exception as err: 97 | print("Encountered exception. {}".format(err)) 98 | 99 | return urls 100 | 101 | 102 | class NewsAPIAgent: 103 | 104 | def __init__(self, api_key, top_headlines=False, crawler=None) -> None: 105 | """ 106 | A simple wrapper for the News API, so we can query it for URLs. 107 | 108 | Args: 109 | api_key (str): The News API key. 110 | top_headlines (bool, optional): Whether to get top headlines. Defaults to False. 111 | crawler (Crawler, optional): The crawler to use. Defaults to None, in which case we will use crawlerPlaywright in headless mode. 112 | """ 113 | self.api_key = api_key 114 | self.top_headlines = top_headlines 115 | if crawler is None: 116 | self.crawler = crawlerPlaywright() 117 | else: 118 | self.crawler = crawler 119 | 120 | def get_news_as_list(self, query: str) -> list: 121 | """ 122 | Query the News API for news articles, and return them as a list of dictionaries. 123 | 124 | Args: 125 | query (str): The query to search for. 126 | 127 | Returns: 128 | list: A list of dictionaries, where each dictionary represents a news article. 129 | """ 130 | url = f"https://newsapi.org/v2/everything?q={query}&apiKey={self.api_key}" 131 | if self.top_headlines: 132 | url = ( 133 | f"https://newsapi.org/v2/top-headlines?q={query}&apiKey={self.api_key}" 134 | ) 135 | response = requests.get(url) 136 | return response.json() 137 | 138 | 139 | class FinancialTimesAgent: 140 | 141 | # The RSS feed URLs for the Financial Times. 142 | ft_rss_feed_urls = [ 143 | "https://www.ft.com/rss/home", 144 | "https://www.ft.com/world?format=rss", 145 | "https://www.ft.com/global-economy?format=rss", 146 | "https://www.ft.com/companies?format=rss", 147 | "https://www.ft.com/opinion?format=rss", 148 | ] 149 | 150 | ft_login_url = "https://ft.com/login" 151 | ft_main_url = "https://ft.com/" 152 | 153 | def __init__(self, user_email, user_password) -> None: 154 | """ 155 | This is a POC agent that uses Playwright to crawl the Financial Times articles you are interested in. Note that you *NEED* to be a subscriber to the FT to make this work, and thus need to provide your FT user name and password. 156 | 157 | Args: 158 | user_email (str): Your FT email. 
159 | user_password (str): Your FT password. 160 | """ 161 | self.user_email = user_email 162 | self.user_password = user_password 163 | 164 | def get_news(self, urls: list[str] = None) -> list: 165 | """ 166 | Get the news from the Financial Times as a list of tuples, where each tuple contains the URL and the extracted text content. 167 | 168 | Args: 169 | urls: a list of URLs to get content for. 170 | 171 | Returns: 172 | A list of lists -- urls, html, and text content 173 | """ 174 | 175 | if urls is None: 176 | urls = set() 177 | for rss_url in self.ft_rss_feed_urls: 178 | agent = RSSAgent(rss_url) 179 | rss_url_list = agent.get_news_as_list() 180 | for r in rss_url_list: 181 | urls.add(r) 182 | urls = list(urls) 183 | 184 | html_content_array = [] 185 | text_content_array = [] 186 | 187 | with sync_playwright() as playwright: 188 | 189 | browser = playwright.firefox.launch(headless=False) 190 | page = browser.new_page() 191 | 192 | # Navigate to the webpage 193 | page.goto(self.ft_main_url) 194 | 195 | print("Accepting Cookies") 196 | page.frame_locator('*[title="SP Consent Message"]').get_by_text( 197 | "Accept Cookies" 198 | ).click() 199 | 200 | time.sleep(2) 201 | 202 | page.goto(self.ft_login_url) 203 | 204 | time.sleep(2) 205 | 206 | print("Entering user name + hitting enter") 207 | 208 | page.locator("#enter-email").fill(self.user_email) 209 | page.keyboard.press("Enter") 210 | 211 | time.sleep(5) 212 | 213 | page.locator("#enter-password").fill(self.user_password) 214 | page.keyboard.press("Enter") 215 | 216 | time.sleep(5) 217 | 218 | url_ctr = 1 219 | for url in urls: 220 | print(f"Getting content for URL {url_ctr} of {len(urls)}") 221 | 222 | html_content = "" 223 | text_content = "" 224 | 225 | try: 226 | page.goto(url) 227 | html_content = page.content() 228 | text_content = _get_text_bs4(html_content) 229 | 230 | print(url) 231 | print(text_content) 232 | 233 | except: 234 | print(url) 235 | print(f"Error getting content for URL {url_ctr} of {len(urls)}") 236 | 237 | html_content_array.append(html_content) 238 | text_content_array.append(text_content) 239 | 240 | url_ctr += 1 241 | 242 | time.sleep(2 + random.randint(0, 5)) 243 | 244 | # Close the browser 245 | browser.close() 246 | 247 | return urls, html_content_array, text_content_array 248 | -------------------------------------------------------------------------------- /emergingtrajectories/pdf.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a very simple set of utility function(s) for loading PDF content. In fact, it might be easier to just use PyPDF directly and avoid this altogether. In the future, we might create specialized functions and classes for doing "fancy" things with PDFs (e.g., OCR, tables, etc.) so have created this module as a way to keep this in mind. 3 | """ 4 | 5 | from pypdf import PdfReader 6 | import requests 7 | import io 8 | 9 | 10 | def get_PDF_content_from_file_by_page(file_path: str) -> list: 11 | """ 12 | Loads a PDF file and extracts the text into a list of strings, one for each page. 13 | 14 | Args: 15 | file_path (str): The path to the PDF file. 16 | 17 | Returns: 18 | list: A list of strings, one for each page. 
19 | """ 20 | reader = PdfReader(file_path) 21 | content = [] 22 | for page in reader.pages: 23 | content.append(page.extract_text()) 24 | return content 25 | 26 | 27 | def get_PDF_content_from_url_by_page(url: str) -> list: 28 | """ 29 | Loads a PDF file from a URL and extracts the text into a list of strings, one for each page. 30 | 31 | Args: 32 | url (str): The URL to the PDF file. 33 | 34 | Returns: 35 | list: A list of strings, one for each page. 36 | """ 37 | response = requests.get(url=url, timeout=120) 38 | pdf_file = io.BytesIO(response.content) 39 | reader = PdfReader(pdf_file) 40 | content = [] 41 | for page in reader.pages: 42 | content.append(page.extract_text()) 43 | return content 44 | 45 | 46 | def get_PDF_content_by_page_from_file(file_path: str) -> str: 47 | """ 48 | Loads a PDF file and extracts the text into one big string. 49 | 50 | Args: 51 | file_path (str): The path to the PDF file. 52 | 53 | Returns: 54 | str: The text content of the PDF file. 55 | """ 56 | reader = PdfReader(file_path) 57 | content = "" 58 | for page in reader.pages: 59 | content += page.extract_text() + "\n" 60 | return content 61 | 62 | 63 | def get_PDF_content_by_page_from_url(url: str) -> str: 64 | """ 65 | Loads a PDF file from a URL and extracts the text into one big string. 66 | 67 | Args: 68 | url (str): The URL to the PDF file. 69 | 70 | Returns: 71 | str: The text content of the PDF file. 72 | """ 73 | response = requests.get(url=url, timeout=120) 74 | pdf_file = io.BytesIO(response.content) 75 | reader = PdfReader(pdf_file) 76 | content = "" 77 | for page in reader.pages: 78 | content += page.extract_text() + "\n" 79 | return content 80 | -------------------------------------------------------------------------------- /emergingtrajectories/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a convenience file for tracking prompts. We'll likely remove this in the (near) future. 3 | """ 4 | 5 | system_prompt_question_continuous = """You are a research agent that is meant to answer questions about specific points and topics. The facts you reference in answering these questions should all be based on information we provide you. We will provide you knowledge base below, where each fact is preceded by an ID (e.g., F1, F2, etc.). All your answers should be absed on these facts ONLY. 6 | 7 | For example, suppose we ask, 'Who is the President of the USA?' and have the following facts... 8 | 9 | F1: The President of the USA is Joe Biden. 10 | F2: The Vice President of the USA is Kamala Harris. 11 | 12 | ... your answers hould be something like this: 13 | 14 | The President of th USA is Joe Biden [F1]. 15 | 16 | We will give you a list of facts for every question. You can reference those facts using square brackets and the fact ID, so [F123] for fact 123, or you can also reference earlier facts from the conversation chain. YOU CANNOT USE OTHER INFORMATION.""" 17 | -------------------------------------------------------------------------------- /emergingtrajectories/recursiveagent.py: -------------------------------------------------------------------------------- 1 | """ 2 | x(t+1) = x(t) + z 3 | ... where x(t) is the current observation about the world 4 | ... 
z is the set of scenarios that will impact x in the future (i.e., x(t+1)) 5 | 6 | This is influenced by Yann LeCun's world modeling approach discussion: https://www.linkedin.com/feed/update/urn:li:activity:7165738293223931904/ 7 | 8 | We are aiming to eventually build some sort of a fact base system. Until then, however, we will be passing information directly through to the agent here. 9 | 10 | We're also using this as a way to test how well our new approach to classes (new Client, new Forecast, etc.) will work, so we can plug and play different types of agents here. 11 | 12 | Note that this approach will *not* test new knowledge bases *yet*. 13 | 14 | """ 15 | 16 | from .knowledge import KnowledgeBaseFileCache 17 | from .utils import UtilityHelper 18 | 19 | from . import Client, Statement, Forecast 20 | 21 | from phasellm.llms import OpenAIGPTWrapper, ChatBot, ChatPrompt 22 | from phasellm.agents import WebpageAgent, WebSearchAgent 23 | 24 | import requests 25 | import dateparser 26 | import re 27 | import datetime 28 | 29 | 30 | class ETClient(object): 31 | 32 | # The base URL for the API, in case we need to change it or if someone wants to self-host anything. 33 | base_url = "https://emergingtrajectories.com/a/api/v0.2/" 34 | 35 | def __init__(self, api_key: str) -> None: 36 | self.api_key = api_key 37 | 38 | def get_statement(self, statement_id: int) -> Statement: 39 | """ 40 | Returns a given statement from the platform. Includes title, description, deadline, and fill-in-the-blank. 41 | 42 | Args: 43 | statement_id: the ID of the statement to retrieve 44 | 45 | Returns: 46 | Statement: the statement from the platform 47 | """ 48 | url = self.base_url + "get_statement" + "/" + str(statement_id) 49 | headers = { 50 | "Authorization": f"Bearer {self.api_key}", 51 | "Content-Type": "application/json", 52 | } 53 | response = requests.post(url, headers=headers) 54 | if response.status_code == 200: 55 | r_obj = response.json() 56 | s = Statement(r_obj["title"], r_obj["fill_in_the_blank"]) 57 | s.id = int(r_obj["id"]) 58 | s.description = r_obj["description"] 59 | s.deadline = dateparser.parse(r_obj["deadline"]) 60 | s.created_at = dateparser.parse(r_obj["created_at"]) 61 | s.updated_at = dateparser.parse(r_obj["updated_at"]) 62 | s.created_by = r_obj["created_by"] 63 | return s 64 | else: 65 | raise Exception(response.text) 66 | 67 | def get_forecast(self, forecast_id: int) -> Forecast: 68 | """ 69 | Returns a given forecast from the platform. 
70 | 71 | Args: 72 | forecast_id: the ID of the forecast to retrieve 73 | 74 | Returns: 75 | Forecast: the forecast from the platform 76 | """ 77 | url = self.base_url + "get_forecast" + "/" + str(forecast_id) 78 | headers = { 79 | "Authorization": f"Bearer {self.api_key}", 80 | "Content-Type": "application/json", 81 | } 82 | response = requests.post(url, headers=headers) 83 | if response.status_code == 200: 84 | 85 | r_obj = response.json() 86 | 87 | f = Forecast(r_obj["title"], float(r_obj["value"]), r_obj["justification"]) 88 | 89 | f.id = int(r_obj["forecast_id"]) 90 | 91 | f.statement_id = int(r_obj["statement_id"]) 92 | f.statement = self.get_statement(int(r_obj["statement_id"])) 93 | 94 | f.created_at = dateparser.parse(r_obj["created_at"]) 95 | f.updated_at = dateparser.parse(r_obj["updated_at"]) 96 | # f.created_by = r_obj["created_by"] 97 | f.prediction_agent = r_obj["prediction_agent"] 98 | 99 | f.additional_data = r_obj["additional_data"] 100 | 101 | if "prior_forecast" in r_obj: 102 | if r_obj["prior_forecast"] is not None: 103 | f.prior_forecast = int(r_obj["prior_forecast"]) 104 | f.is_human = bool(r_obj["is_human"]) 105 | 106 | if "next_forecasts" in r_obj: 107 | if r_obj["next_forecasts"] is not None: 108 | f.next_forecasts = r_obj["next_forecasts"] 109 | 110 | return f 111 | else: 112 | raise Exception(response.text) 113 | 114 | def add_facts_to_factbase( 115 | self, fact_db_slug: str, url: str, facts: list[str] 116 | ) -> bool: 117 | """ 118 | Adds a list of facts to a factbase on the Emerging Trajectories website. 119 | 120 | Args: 121 | fact_db_slug: the slug of the fact database to add the facts to. 122 | url: the URL the facts come from. 123 | facts: the facts to add (a list of strings). 124 | 125 | Returns: 126 | bool: True if successful, False otherwise. 127 | """ 128 | 129 | api_url = self.base_url + "add_facts/" + fact_db_slug 130 | headers = { 131 | "Authorization": f"Bearer {self.api_key}", 132 | "Content-Type": "application/json", 133 | } 134 | j = { 135 | "facts": facts, 136 | "url": url, 137 | } 138 | response = requests.post(api_url, headers=headers, json=j) 139 | 140 | if response.status_code == 200 or response.status_code == 201: 141 | return True 142 | print(response) 143 | return False 144 | 145 | def add_fact_to_factbase(self, fact_db_slug: str, url: str, fact: str) -> bool: 146 | """ 147 | Adds a fact to a factbase on the Emerging Trajectories website. 148 | 149 | Args: 150 | fact_db_slug: the slug of the fact database to add the fact to. 151 | url: the URL of the fact. 152 | fact: the fact to add. 153 | 154 | Returns: 155 | bool: True if successful, False otherwise. 156 | """ 157 | api_url = self.base_url + "add_fact/" + fact_db_slug 158 | headers = { 159 | "Authorization": f"Bearer {self.api_key}", 160 | "Content-Type": "application/json", 161 | } 162 | j = { 163 | "fact": fact, 164 | "url": url, 165 | } 166 | response = requests.post(api_url, headers=headers, json=j) 167 | 168 | if response.status_code == 200 or response.status_code == 201: 169 | return True 170 | print(response) 171 | return False 172 | 173 | def add_content_to_factbase( 174 | self, fact_db_slug: str, url: str, content: str, topic: str 175 | ) -> bool: 176 | """ 177 | Sends content to the Emerging Trajectories website and extracts facts from it. 178 | 179 | Args: 180 | fact_db_slug: the slug of the fact database to add the content to. 181 | url: the URL of the content. Note: we do not actually crawl this, we assume the content passed is the right content. 182 | content: the content to extract facts from.
183 | topic: the topic of the content. 184 | 185 | Returns: 186 | bool: True if successful, False otherwise. 187 | """ 188 | 189 | api_url = self.base_url + "add_content_to_factbase/" + fact_db_slug 190 | headers = { 191 | "Authorization": f"Bearer {self.api_key}", 192 | "Content-Type": "application/json", 193 | } 194 | j = { 195 | "content": content, 196 | "url": url, 197 | "topic": topic, 198 | } 199 | response = requests.post(api_url, headers=headers, json=j) 200 | 201 | if response.status_code == 200 or response.status_code == 201: 202 | return True 203 | print(response) 204 | return False 205 | 206 | 207 | # TODO Move to Utils.py, or elsewhere. 208 | def clean_citations(assistant_analysis: str, ctr_to_source: dict) -> str: 209 | """ 210 | The analysis currently contains numerical citations that are likely not in order, or in some cases are not used. We will update the cituations to follow the proper numerical order, and also include the URLs at the very end. 211 | 212 | Args: 213 | assistant_analysis: the analysis text from the assistant 214 | ctr_to_source: the mapping of citation number to source URL 215 | 216 | Returns: 217 | str: the cleaned analysis text, with citations following a proper numerical format and URIs at the end of the analysis 218 | """ 219 | 220 | new_ctr_map = {} 221 | ctr = 1 222 | 223 | end_notes = "\n\n--- SOURCES ---\n\n" 224 | new_analysis = "" 225 | 226 | matches = re.finditer(r"\[\d+\]", assistant_analysis) 227 | 228 | last_index = 0 229 | for m in matches: 230 | 231 | m_start = m.start() + 1 232 | m_end = m.end() - 1 233 | 234 | old_ctr = int(m.group()[1:-1]) 235 | uri = ctr_to_source[old_ctr] 236 | 237 | if old_ctr not in new_ctr_map: 238 | new_ctr_map[old_ctr] = ctr 239 | end_notes += f"{ctr}: {uri}\n" 240 | ctr += 1 241 | 242 | new_analysis += assistant_analysis[last_index:m_start] + str( 243 | new_ctr_map[old_ctr] 244 | ) 245 | last_index = m_end 246 | 247 | if last_index != 0: 248 | new_analysis += assistant_analysis[last_index:] + end_notes 249 | 250 | else: 251 | new_analysis = assistant_analysis + end_notes + "No citations provided." 252 | 253 | return new_analysis 254 | 255 | 256 | #### 257 | # INITIAL FORECAST 258 | # 259 | 260 | base_system_prompt = """You are a researcher tasked with helping forecast economic and social trends. The title of our research project is: {statement_title}. 261 | 262 | The project description is as follows... 263 | {statement_description} 264 | 265 | We will provide you with content from reports and web pages that is meant to help with the above. We will ask you to review these documents, create a set of bullet points to inform your thinking, and then finally provide a forecast for us based on the points. 266 | 267 | The format of the forecast needs to be, verbatim, as follows: {statement_fill_in_the_blank} 268 | """ 269 | 270 | base_user_prompt = """Today's date is {the_date}. We will now provide you with all the content we've managed to collect. 271 | 272 | ---------------------- 273 | {scraped_content} 274 | ---------------------- 275 | 276 | Please think step-by-step by (a) extracting critical bullet points from the above, and (b) discuss your logic and rationale for making a forecast based on the above. 277 | 278 | The content we provided you contains source numbers in the format 'SOURCE: #'. When you extract facts, please include the citation in square brackets, with the #, like [#], but replace "#" with the actual Source # from the crawled content we are providing you. 
279 | 
280 | For example, if you are referring to a fact that came under --- SOURCE: 3 ---, you would write something like: "Data is already trending toward hotter temperatures [3]." Do not include the "#" in the brackets, just the number.
281 | 
282 | Do this for the final justification of your forecast as well.
283 | 
284 | We realize you are being asked to provide a speculative forecast. We are using this to better understand the world and finance, so please fill in the blank. We will not use this for any active decision-making, but more to learn about the capabilities of AI.
285 | """
286 | 
287 | base_user_prompt_followup = """Thank you! Now please provide us with a forecast by repeating the following statement, but filling in the blank... DO NOT provide a range, but provide one specific numerical value. If you are unable to provide a forecast, please respond with "UNCLEAR".
288 | 
289 | {statement_fill_in_the_blank}
290 | """
291 | 
292 | 
293 | class RecursiveForecastingAgent(object):
294 | 
295 |     # TODO: eventually, should move the Google / KnowledgeBaseFileCache to some other knowledge process.
296 |     def __init__(
297 |         self,
298 |         client: ETClient,
299 |         chatbot: ChatBot,
300 |         google_api_key: str,
301 |         google_search_id: str,
302 |         google_search_query: str,
303 |         knowledge_base: KnowledgeBaseFileCache,
304 |     ):
305 | 
306 |         self.google_api_key = google_api_key
307 |         self.google_search_id = google_search_id
308 |         self.google_search_query = google_search_query
309 |         self.knowledge_base = knowledge_base
310 |         self.client = client
311 |         self.chatbot = chatbot
312 | 
313 |     # TODO / NOTE: this allows us to continue chatting with the forecasting agent, since we can obtain the chatbot later. Given that some folks are interested in asking for clarifications, this could be an interesting opportunity.
314 |     def setChatBot(self, chatbot):
315 |         self.chatbot = chatbot
316 | 
317 |     # TODO: standardize -- camel case or snake case? Or something else?
318 |     def getChatBot(self):
319 |         return self.chatbot
320 | 
321 |     def create_forecast(
322 |         self, statement: Statement, openai_api_key, et_api_key, facts=None
323 |     ):
324 |         """
325 |         Creates an initial forecast for `statement`: gathers new web content into the knowledge base, prompts the LLM for an analysis and a numerical prediction, and submits the result to the Emerging Trajectories platform (returns None if no new content was found). Design notes -- options for taking in x(t) or z...
326 |         1) x(t) and z are strings... An array of facts.
327 |         2) x(t) and z are specific preprogrammed/strict facts, like "today's date" and "last forecast".
328 |         3) Facts are "Fact Objects" that have specific string representations. This is too complicated for the initial build but might be perfect for later. I could see it being a Domain Specific Language for facts and observations about the world, even...
329 |         """
330 | 
331 |         statement_id = statement.id
332 |         statement_title = statement.title
333 |         statement_description = statement.description
334 |         fill_in_the_blank = statement.fill_in_the_blank
335 | 
336 |         knowledge_base = self.knowledge_base
337 | 
338 |         webagent = WebSearchAgent(api_key=self.google_api_key)
339 |         results = webagent.search_google(
340 |             query=self.google_search_query,
341 |             custom_search_engine_id=self.google_search_id,
342 |             num=10,
343 |         )
344 | 
345 |         scraped_content = ""
346 | 
347 |         added_new_content = False
348 | 
349 |         # We store the accessed resources and log access only when we successfully submit a forecast. If anything fails, we'll review those resources again during the next forecasting attempt.
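        # ctr numbers each piece of scraped content added below, and ctr_to_source maps that number
        # back to its source URL so that clean_citations() can later renumber the model's [#]
        # references and append the matching source list to the analysis.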
350 | accessed_resources = [] 351 | 352 | ctr = 0 353 | ctr_to_source = {} 354 | 355 | for result in results: 356 | if not knowledge_base.in_cache(result.url): 357 | ctr += 1 358 | added_new_content = True 359 | page_content = knowledge_base.get(result.url) 360 | 361 | accessed_resources.append(result.url) 362 | # knowledge_base.log_access(result.url) 363 | 364 | scraped_content += ( 365 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 366 | ) 367 | ctr_to_source[ctr] = result.url 368 | 369 | # We also check the knowledge base for content that was added manually. 370 | unaccessed_uris = knowledge_base.get_unaccessed_content() 371 | for ua in unaccessed_uris: 372 | added_new_content = True 373 | ctr += 1 374 | page_content = knowledge_base.get(ua) 375 | 376 | accessed_resources.append(ua) 377 | # knowledge_base.log_access(ua) 378 | 379 | scraped_content += ( 380 | f"{page_content}\n\n--- SOURCE: {ctr}-------------------\n\n" 381 | ) 382 | ctr_to_source[ctr] = ua 383 | 384 | if not added_new_content: 385 | print("No new content added to the forecast.") 386 | return None 387 | 388 | the_date = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S") 389 | 390 | # llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 391 | # chatbot = ChatBot(llm) 392 | chatbot = self.chatbot 393 | 394 | first_user_message = base_user_prompt 395 | if facts is not None: 396 | fact_str = "" 397 | for f in facts: 398 | fact_str += "-- " + f + "\n" 399 | first_user_message = ( 400 | "We know the following facts. These are fully correct and should be used to inform your forecast:" 401 | + fact_str.strip() 402 | + "\n\n" 403 | + first_user_message 404 | ) 405 | 406 | prompt_template = ChatPrompt( 407 | [ 408 | {"role": "system", "content": base_system_prompt}, 409 | {"role": "user", "content": first_user_message}, 410 | ] 411 | ) 412 | 413 | chatbot.messages = prompt_template.fill( 414 | statement_title=statement_title, 415 | statement_description=statement_description, 416 | statement_fill_in_the_blank=fill_in_the_blank, 417 | scraped_content=scraped_content, 418 | the_date=the_date, 419 | ) 420 | 421 | assistant_analysis = chatbot.resend() 422 | 423 | print("\n\n\n") 424 | print(assistant_analysis) 425 | 426 | prompt_template_2 = ChatPrompt( 427 | [ 428 | {"role": "system", "content": base_system_prompt}, 429 | {"role": "user", "content": first_user_message}, 430 | {"role": "assistant", "content": "{assistant_analysis}"}, 431 | {"role": "user", "content": base_user_prompt_followup}, 432 | ] 433 | ) 434 | 435 | chatbot.messages = prompt_template_2.fill( 436 | statement_title=statement_title, 437 | statement_description=statement_description, 438 | statement_fill_in_the_blank=fill_in_the_blank, 439 | scraped_content=scraped_content, 440 | assistant_analysis=assistant_analysis, 441 | the_date=the_date, 442 | ) 443 | 444 | filled_in_statement = chatbot.resend() 445 | 446 | print("\n\n\n") 447 | print(filled_in_statement) 448 | 449 | assistant_analysis_sourced = clean_citations(assistant_analysis, ctr_to_source) 450 | 451 | print("\n\n\n*** ANALYSIS WITH CITATIONS***\n\n\n") 452 | print(assistant_analysis_sourced) 453 | 454 | uh = UtilityHelper(openai_api_key) 455 | prediction = uh.extract_prediction(filled_in_statement, fill_in_the_blank) 456 | 457 | client = Client(et_api_key) 458 | 459 | response = client.create_forecast( 460 | statement_id, 461 | "Prediction", 462 | assistant_analysis_sourced, 463 | prediction, 464 | "Test Agent", 465 | { 466 | "full_response_from_llm_before_source_cleanup": 
assistant_analysis, 467 | "full_response_from_llm": assistant_analysis_sourced, 468 | "raw_forecast": filled_in_statement, 469 | "extracted_value": prediction, 470 | }, 471 | ) 472 | 473 | for ar in accessed_resources: 474 | knowledge_base.log_access(ar) 475 | 476 | return response 477 | 478 | def extend_forecast(self, forecast: Forecast): 479 | pass 480 | -------------------------------------------------------------------------------- /emergingtrajectories/utils.py: -------------------------------------------------------------------------------- 1 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 2 | 3 | # Prompt used for extracting predictions from text messages. 4 | _extract_prediction_prompt = """You are helping a researcher with a data extraction exercise. You will be provided with a prediction statement and a broader piece of text. Your objective is to extract the specific numerical prediction and provide it as a response. DO NOT qualify your response in any way. 5 | 6 | For example, suppose you have the following... 7 | 8 | |||---||| 9 | PREDICTION STATEMENT: please predict the probability that the price of Bitcoin will exceed $100,000 by the end of 2024. 10 | 11 | TEXT: The probability that bitcoin will exceed $100,000 by the end of 2024 is 0.37. 12 | |||---||| 13 | 14 | In the case above, your response would simply be "0.37". 15 | 16 | The actual metrics (i.e., prediction) might be provided with formatting. For example... 17 | 18 | |||---||| 19 | PREDICTION STATEMENT: The probability that Boeing's (NYSE:BA) share price at the close of markets on or before March 1, 2024 will be $220.00 USD or higher is _____ (value between 0.00 and 1.00). 20 | 21 | TEXT: The probability that Boeing's (NYSE:BA) share price at the close of markets on or before March 1, 2024, will be $220.00 USD or higher is **0.65**. 22 | |||---||| 23 | 24 | In this case, ignore the asterisks or instructions ("value between 0.00 and 1.00") and provide the correct response, which is 0.65. 25 | 26 | The user will provide you with a PREDICTION STATEMENT and TEXT and you need to answer like the above. 27 | 28 | On the extremely rare occasion that the TEXT does not have a proper numerical prediction or you are unable to extract it, simply respond with "UNCLEAR". 29 | """ 30 | 31 | # Error message used when the prediction cannot be extracted from the response. 32 | _extract_prediction_prompt_error = "UNCLEAR" 33 | 34 | 35 | def is_numeric(string: str) -> bool: 36 | """ 37 | Checks whether the 'string' passed as an argument can be converted into a numeric value. 38 | 39 | Args: 40 | string: the string in question 41 | 42 | Returns: 43 | Boolean value; True if the string can be converted into a numeric value, False otherwise. 
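
    Example:
        is_numeric("0.37") returns True, while is_numeric("$85") returns False -- callers such as extract_prediction() strip currency symbols and commas before calling this.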
44 | """ 45 | if string is None: 46 | return False 47 | try: 48 | float(string) 49 | return True 50 | except ValueError: 51 | return False 52 | 53 | 54 | # TODO document 55 | def run_forecast(function_to_call, n, *args, **kwargs): 56 | 57 | if n == 0: 58 | return None 59 | 60 | result = None 61 | 62 | try: 63 | result = function_to_call(*args, **kwargs) 64 | except Exception as e: 65 | print(f"Forecast failed with error: {e}") 66 | print(f"Trying up to {n-1} more times.") 67 | result = run_forecast(function_to_call, n - 1, *args, **kwargs) 68 | 69 | if result is None: 70 | print(f"Forecast failed after {n} attempts.") 71 | 72 | return result 73 | 74 | 75 | class UtilityHelper(object): 76 | 77 | def __init__(self, api_key, model="gpt-4-0125-preview") -> None: 78 | """ 79 | The UtilityHelper class is used to extract predictions from text messages. 80 | 81 | Args: 82 | api_key: the OpenAI API key 83 | model: the OpenAI model to use for the extraction process 84 | """ 85 | 86 | self.api_key = api_key 87 | self.model = model 88 | 89 | def extract_prediction(self, response: str, statement_challenge: str) -> float: 90 | """ 91 | Extracts the prediction value from the response to a statement challenge. 92 | 93 | Args: 94 | response: the response to the statement challenge (i.e., what was predicted by another LLM) 95 | statement_challenge: the statement challenge -- what is being predicted 96 | 97 | Returns: 98 | The extracted prediction value as a float. Raises an exception if the prediction cannot be extracted. 99 | """ 100 | 101 | message_stack = [ 102 | {"role": "system", "content": _extract_prediction_prompt}, 103 | { 104 | "role": "user", 105 | "content": f"PREDICTION STATEMENT: {statement_challenge}\n\nTEXT: {response}", 106 | }, 107 | ] 108 | 109 | # print(f"PREDICTION STATEMENT: {statement_challenge}\n\nTEXT: {response}") 110 | 111 | llm = OpenAIGPTWrapper(apikey=self.api_key, model=self.model) 112 | chatbot = ChatBot(llm) 113 | chatbot.messages = message_stack 114 | 115 | output = chatbot.resend() 116 | 117 | # print(f"\n\n\n{output}\b\b\b") 118 | 119 | if output == _extract_prediction_prompt_error: 120 | raise Exception("Unable to extract prediction from response.") 121 | 122 | if output[0] == "$": 123 | output = output[1:] 124 | 125 | # Remove commas... 126 | output = output.replace(",", "") 127 | 128 | if not is_numeric(output): 129 | raise Exception(f"Prediction does not appear to be numeric:\n{output}") 130 | 131 | return float(output) 132 | -------------------------------------------------------------------------------- /forecast1.py: -------------------------------------------------------------------------------- 1 | # This is a sample model for tracking oil prices. 
2 | 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | load_dotenv() 7 | openai_api_key = os.getenv("OPENAI_API_KEY") 8 | et_api_key = os.getenv("ET_API_KEY") 9 | google_api_key = os.getenv("GOOGLE_API_KEY") 10 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 11 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 12 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 13 | news_api_key = os.getenv("NEWS_API_KEY") 14 | 15 | from emergingtrajectories.news import RSSAgent 16 | from emergingtrajectories.crawlers import crawlerPlaywright 17 | from emergingtrajectories.factsrag import ( 18 | FactRAGFileCache, 19 | FactBot, 20 | clean_fact_citations, 21 | ) 22 | from emergingtrajectories.factsragforecaster import FactsRAGForecastingAgent 23 | from emergingtrajectories import Client 24 | from emergingtrajectories.recursiveagent import ETClient 25 | 26 | from phasellm.llms import ( 27 | OpenAIGPTWrapper, 28 | ClaudeWrapper, 29 | VertexAIWrapper, 30 | ChatBot, 31 | ReplicateLlama2Wrapper, 32 | ) 33 | 34 | topic = "oil futures and oil prices for 2024" 35 | 36 | queries = ["oil prices end of 2024 and early 2025", "oil prices today"] 37 | 38 | crawler = crawlerPlaywright(True) 39 | fr = FactRAGFileCache("forecasting_oil", openai_api_key, crawler=crawler) 40 | 41 | """ 42 | # Testing cleaning citations... 43 | 44 | str_out = clean_fact_citations( 45 | fr, "Hey, this is a set of facts [f1]. This is also a set of facts [f2, f3]." 46 | ) 47 | print(str_out) 48 | """ 49 | 50 | """ 51 | # Get Content 52 | 53 | fr.new_get_rss_links( 54 | "https://www.oilholicssynonymous.com/feeds/posts/default", topic=topic 55 | ) 56 | # TODO Need to fix timeout bug. 57 | # fr.new_get_rss_links("https://oilprice.com/rss/main", topic=topic) 58 | 59 | fr.new_get_new_info_google(google_api_key, google_search_id, queries, topic=topic) 60 | """ 61 | 62 | """ 63 | # Create a forecast 64 | 65 | # client = Client(et_api_key) 66 | # s = client.get_statement(5) 67 | etc = ETClient(et_api_key) 68 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 69 | chatbot = ChatBot(llm) 70 | f = FactsRAGForecastingAgent(ETClient(et_api_key), chatbot, fr) 71 | f.create_forecast( 72 | etc.get_statement(5), 73 | openai_api_key, 74 | et_api_key, 75 | ["Today's oil price is about $85."], 76 | prediction_agent="FactsRAGForecastingAgent", 77 | ) 78 | """ 79 | 80 | """ 81 | # Extend a forecast 82 | etc = ETClient(et_api_key) 83 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 84 | chatbot = ChatBot(llm) 85 | f = FactsRAGForecastingAgent(ETClient(et_api_key), chatbot, fr) 86 | f.extend_forecast( 87 | etc.get_forecast(252), 88 | openai_api_key, 89 | et_api_key, 90 | prediction_agent="FactsRAGForecastingAgent", 91 | ) 92 | """ 93 | -------------------------------------------------------------------------------- /forecasttest1.py: -------------------------------------------------------------------------------- 1 | # See forecast1.py instead!! 
2 | 3 | import os 4 | from dotenv import load_dotenv 5 | 6 | from google.cloud import aiplatform 7 | 8 | aiplatform.init(project="phasellm-gemini-testing") 9 | 10 | load_dotenv() 11 | openai_api_key = os.getenv("OPENAI_API_KEY") 12 | et_api_key = os.getenv("ET_API_KEY") 13 | google_api_key = os.getenv("GOOGLE_API_KEY") 14 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 15 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 16 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 17 | news_api_key = os.getenv("NEWS_API_KEY") 18 | 19 | from emergingtrajectories import Client 20 | from emergingtrajectories.news import NewsAPIAgent 21 | from emergingtrajectories.crawlers import crawlerPlaywright 22 | from emergingtrajectories.factsrag import FactRAGFileCache 23 | from emergingtrajectories.recursiveagent import ETClient 24 | from emergingtrajectories.factsragforecaster import FactsRAGForecastingAgent 25 | 26 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 27 | 28 | topic = "drivers of oil prices, associated political/economic/military climate, and oil futures commodity prices" 29 | 30 | crawler = crawlerPlaywright(False) 31 | fr = FactRAGFileCache("testing_oil", openai_api_key, crawler=crawler) 32 | 33 | etc = ETClient(et_api_key) 34 | statement = etc.get_statement(5) 35 | 36 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 37 | chatbot = ChatBot(llm) 38 | 39 | google_search_queries = [ 40 | "oil price projections", 41 | "how do oil prices work", 42 | "oil and geopolitics 2024", 43 | ] 44 | 45 | # fr.new_get_new_info_google( 46 | # google_api_key, 47 | # google_search_id, 48 | # google_search_queries, 49 | # topic, 50 | # ) 51 | 52 | forecaster = FactsRAGForecastingAgent(etc, chatbot, fr) 53 | 54 | """ 55 | result = forecaster.create_forecast( 56 | etc.get_statement(5), 57 | openai_api_key, 58 | et_api_key, 59 | ) 60 | print(result) 61 | """ 62 | 63 | """ 64 | client = Client(et_api_key) 65 | forecast_id = client.get_most_recent_forecast(5) 66 | result = forecaster.extend_forecast( 67 | etc.get_forecast(forecast_id), 68 | openai_api_key, 69 | et_api_key, 70 | ) 71 | print(result) 72 | """ 73 | -------------------------------------------------------------------------------- /newstest1.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | 20 | """ 21 | na = NewsAPIAgent(news_api_key) 22 | 23 | r = na.get_news("lng") 24 | r = na.get_news("covid") 25 | 26 | for result in r['articles']: 27 | print("") 28 | print(result['title']) 29 | print(result['url']) 30 | print(result['publishedAt']) 31 | print("\n\n") 32 | """ 33 | 34 | """from emergingtrajectories.factsrag import FactRAGFileCache 35 | 36 | topic = "Liquefied Natural Gas (LNG) futures + commodity prices" 37 | 38 | crawler = crawlerPlaywright(False) 39 | fr = FactRAGFileCache("test_rag", openai_api_key, crawler=crawler) 40 | # 
fr.facts_from_url("https://www.bbc.com/news/business-63585732", topic=topic) 41 | 42 | print(fr.query_to_fact_content("What is LNG?")) 43 | """ 44 | 45 | na = NewsAPIAgent(news_api_key) 46 | 47 | r = na.get_news_as_list("autonomous trucking") 48 | 49 | for result in r["articles"]: 50 | print("") 51 | print(result["title"]) 52 | print(result["url"]) 53 | print(result["publishedAt"]) 54 | print("\n\n") 55 | -------------------------------------------------------------------------------- /newstest2.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, FactBot 20 | 21 | topic = "Liquefied Natural Gas (LNG) futures + commodity prices" 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("test_rag", openai_api_key, crawler=crawler) 24 | bot = FactBot(fr, openai_api_key) 25 | -------------------------------------------------------------------------------- /newstest3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache 20 | from emergingtrajectories.recursiveagent import ETClient 21 | 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 25 | 26 | crawler = crawlerPlaywright(False) 27 | fr = FactRAGFileCache("rag_lng", openai_api_key, crawler=crawler) 28 | 29 | etc = ETClient(et_api_key) 30 | statement = etc.get_statement(37) 31 | 32 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 33 | chatbot = ChatBot(llm) 34 | 35 | google_search_queries = [ 36 | "LNG futures prices and estimmates", 37 | "LNG prices in 2024 and 2025", 38 | "Political, economic, and military drivers of LNG prices", 39 | "Biggest producers of LNG", 40 | "Biggest consumers of LNG", 41 | "LNG and Natural Gas price and structural relationships", 42 | "The politics of LNG", 43 | "LNG regulations in USA", 44 | "LNG regulations in China", 45 | "LNG regulations in Europe", 46 | "LNG and the Russia/Ukraine war", 47 | "The economics of LNG production", 48 | "Economic models and LNG", 49 | ] 50 
| 51 | fr.summarize_new_info_multiple_queries( 52 | statement, 53 | chatbot, 54 | google_api_key, 55 | google_search_id, 56 | google_search_queries, 57 | topic, 58 | ) 59 | -------------------------------------------------------------------------------- /newstest4.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, FactBot 20 | 21 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("rag_lng", openai_api_key, crawler=crawler) 24 | bot = FactBot(fr, openai_api_key) 25 | -------------------------------------------------------------------------------- /newstest5.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from google.cloud import aiplatform 5 | 6 | aiplatform.init(project="phasellm-gemini-testing") 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | from emergingtrajectories.news import NewsAPIAgent 18 | from emergingtrajectories.crawlers import crawlerPlaywright 19 | from emergingtrajectories.factsrag import FactRAGFileCache, clean_fact_citations 20 | from emergingtrajectories.recursiveagent import ETClient 21 | 22 | from phasellm.llms import OpenAIGPTWrapper, ChatBot 23 | 24 | from datetime import datetime 25 | 26 | topic = "drivers of Liquefied Natural Gas (LNG) prices, associated political/economic/military climate, and LNG futures commodity prices" 27 | 28 | crawler = crawlerPlaywright(False) 29 | fr = FactRAGFileCache("testing_news_lng", openai_api_key, crawler=crawler) 30 | 31 | etc = ETClient(et_api_key) 32 | statement = etc.get_statement(37) 33 | 34 | llm = OpenAIGPTWrapper(openai_api_key, "gpt-4-0125-preview") 35 | chatbot = ChatBot(llm) 36 | 37 | google_search_queries = [ 38 | "Economic models and LNG", 39 | ] 40 | 41 | """fr.new_get_new_info_google( 42 | google_api_key, 43 | google_search_id, 44 | google_search_queries, 45 | topic, 46 | )""" 47 | 48 | """fr.new_get_new_info_news( 49 | news_api_key, topic, ["lng", "liquefied natural gas"], top_headlines=False 50 | )""" 51 | 52 | """print( 53 | fr.query_to_fact_content( 54 | "What is LNG?", n_results=3, since_date=datetime(2023, 1, 1) 55 | ) 56 | )""" 57 | 58 | # text = "abcd [f1], [f3, f6] and so on [f4]" 59 | # print(clean_fact_citations(fr, text)) 60 | 
-------------------------------------------------------------------------------- /newstest6.py: -------------------------------------------------------------------------------- 1 | # Testing RSS feeds 2 | # https://www.oilholicssynonymous.com/feeds/posts/default 3 | 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | openai_api_key = os.getenv("OPENAI_API_KEY") 9 | et_api_key = os.getenv("ET_API_KEY") 10 | google_api_key = os.getenv("GOOGLE_API_KEY") 11 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 12 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 13 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 14 | news_api_key = os.getenv("NEWS_API_KEY") 15 | 16 | from emergingtrajectories.news import RSSAgent 17 | from emergingtrajectories.crawlers import crawlerPlaywright 18 | from emergingtrajectories.factsrag import FactRAGFileCache 19 | 20 | topic = "oil futures and oil prices" 21 | 22 | crawler = crawlerPlaywright(False) 23 | fr = FactRAGFileCache("test_rss_oil", openai_api_key, crawler=crawler) 24 | 25 | fr.new_get_rss_links( 26 | "https://www.oilholicssynonymous.com/feeds/posts/default", topic=topic 27 | ) 28 | 29 | print(fr.query_to_fact_content("How are oil prices doing in March 2024?")) 30 | -------------------------------------------------------------------------------- /newstest7.py: -------------------------------------------------------------------------------- 1 | topic = ( 2 | "Any news related to finance, economics, government, diplomacy, or current affairs" 3 | ) 4 | 5 | import os 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | openai_api_key = os.getenv("OPENAI_API_KEY") 10 | et_api_key = os.getenv("ET_API_KEY") 11 | google_api_key = os.getenv("GOOGLE_API_KEY") 12 | google_search_id = os.getenv("GOOGLE_SEARCH_ID") 13 | anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") 14 | replicate_api_key = os.getenv("REPLICATE_API_KEY") 15 | news_api_key = os.getenv("NEWS_API_KEY") 16 | 17 | ft_user = os.getenv("FT_USER_NAME") 18 | ft_pass = os.getenv("FT_PASSWORD") 19 | 20 | from emergingtrajectories.news import FinancialTimesAgent 21 | from emergingtrajectories.factsrag import FactRAGFileCache 22 | 23 | # fta = FinancialTimesAgent(ft_user, ft_pass) 24 | # a = fta.get_news() 25 | 26 | f = FactRAGFileCache("rag_demo_ft_rss", openai_api_key) 27 | f.get_ft_news(ft_user, ft_pass, topic) 28 | -------------------------------------------------------------------------------- /project_metadata.py: -------------------------------------------------------------------------------- 1 | NAME = "emergingtrajectories" 2 | 3 | AUTHOR = "Wojciech Gryc" 4 | 5 | VERSION = "0.2.53" 6 | 7 | DESCRIPTION = "Open source library for tracking and saving forecasts of political, economic, and social events." 8 | 9 | LONG_DESCRIPTION = "Open source library for tracking and saving forecasts of political, economic, and social events." 
10 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | twine>=4.0.2 2 | wheel>=0.41.3 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | phasellm>=0.0.22 2 | Django>=5.0.0 3 | python-dotenv>=1.0.0 4 | dateparser>=1.2.0 5 | pytest-playwright 6 | beautifulsoup4 7 | chromadb 8 | feedparser 9 | pypdf 10 | faiss-cpu 11 | microsoft-bing-newssearch 12 | scrapingbee 13 | tiktoken 14 | # db-dtypes # For Google Cloud, unsure if needed -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from project_metadata import NAME, VERSION, AUTHOR, DESCRIPTION, LONG_DESCRIPTION 4 | 5 | setup( 6 | name=NAME, 7 | version=VERSION, 8 | description=DESCRIPTION, 9 | long_description=LONG_DESCRIPTION, 10 | author=AUTHOR, 11 | author_email="hello@phaseai.com", 12 | license="MIT", 13 | packages=find_packages(), 14 | install_requires=[ 15 | "phasellm>=0.0.22", 16 | "Django>=5.0.0", 17 | "python-dotenv>=1.0.0", 18 | "dateparser>=1.2.0", 19 | "pytest-playwright", 20 | "beautifulsoup4", 21 | "chromadb", 22 | "feedparser", 23 | "pypdf", 24 | "faiss-cpu", 25 | "microsoft-bing-newssearch", 26 | "scrapingbee", 27 | "tiktoken", 28 | ], 29 | extras_require={ 30 | "docs": [ 31 | "furo", 32 | "sphinx>=7.1.2", 33 | "myst_parser>=2.0.0", 34 | "sphinx-autoapi>=2.1.1", 35 | "sphinx-autobuild>=2021.3.14", 36 | ] 37 | }, 38 | python_requires=">=3.10.0", 39 | keywords="llm, nlp, ai, social, politics, economics", 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "License :: OSI Approved :: MIT License", 44 | "Programming Language :: Python :: 3", 45 | ], 46 | ) 47 | --------------------------------------------------------------------------------