├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── assets
    ├── evals.png
    └── sentient-logo-narrow.png
├── evals
    ├── README.md
    ├── autograde_df.py
    ├── datasets
    │   ├── frames_test_set.csv
    │   └── simple_qa_test_set.csv
    ├── eval_gpt_web.py
    ├── eval_tasks.py
    ├── gpt_web_extract.py
    └── grader_prompts.py
├── gradio_demo.py
├── pdm.lock
├── pyproject.toml
├── requirements.txt
├── src
    └── opendeepsearch
    │   ├── __init__.py
    │   ├── context_building
    │       ├── build_context.py
    │       └── process_sources_pro.py
    │   ├── context_scraping
    │       ├── basic_web_scraper.py
    │       ├── crawl4ai_scraper.py
    │       ├── extraction_result.py
    │       ├── fast_scraper.py
    │       ├── strategy_factory.py
    │       └── utils.py
    │   ├── ods_agent.py
    │   ├── ods_tool.py
    │   ├── prompts.py
    │   ├── ranking_models
    │       ├── README.md
    │       ├── base_reranker.py
    │       ├── chunker.py
    │       ├── infinity_rerank.py
    │       └── jina_reranker.py
    │   ├── serp_search
    │       └── serp_search.py
    │   └── wolfram_tool.py
└── tests
    └── __init__.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # SEARXNG_INSTANCE_URL=http://searxng:8080
 2 | # or
 3 | # SERPER_API_KEY=
 4 | 
 5 | JINA_API_KEY=
 6 | WOLFRAM_ALPHA_APP_ID=
 7 | 
 8 | ### Providers ###
 9 | OPENAI_API_KEY=
10 | OPENAI_BASE_URL=
11 | ANTHROPIC_API_KEY=
12 | OPENROUTER_API_KEY=
13 | 
14 | # LiteLLM model IDs for different tasks
15 | LITELLM_MODEL_ID=openrouter/google/gemini-2.0-flash-001
16 | LITELLM_SEARCH_MODEL_ID=openrouter/google/gemini-2.0-flash-001
17 | LITELLM_ORCHESTRATOR_MODEL_ID=openrouter/google/gemini-2.0-flash-001
18 | LITELLM_EVAL_MODEL_ID=gpt-4o-mini
19 | FIREWORKS_API_KEY=
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | *.ipynb
  9 | *.ipynb_checkpoints
 10 | output/
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cover/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | .pybuilder/
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | #   For a library or package, you might want to ignore these files since the code is
 90 | #   intended to run in multiple environments; otherwise, check them in:
 91 | # .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # poetry
101 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
103 | #   commonly ignored for libraries.
104 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 | 
107 | # pdm
108 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109 | #pdm.lock
110 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111 | #   in version control.
112 | #   https://pdm-project.org/#use-with-ide
113 | .pdm.toml
114 | .pdm-python
115 | .pdm-build/
116 | 
117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118 | __pypackages__/
119 | 
120 | # Celery stuff
121 | celerybeat-schedule
122 | celerybeat.pid
123 | 
124 | # SageMath parsed files
125 | *.sage.py
126 | 
127 | # Environments
128 | .env
129 | .venv
130 | env/
131 | venv/
132 | ENV/
133 | env.bak/
134 | venv.bak/
135 | 
136 | # Spyder project settings
137 | .spyderproject
138 | .spyproject
139 | 
140 | # Rope project settings
141 | .ropeproject
142 | 
143 | # mkdocs documentation
144 | /site
145 | 
146 | # mypy
147 | .mypy_cache/
148 | .dmypy.json
149 | dmypy.json
150 | 
151 | # Pyre type checker
152 | .pyre/
153 | 
154 | # pytype static type analyzer
155 | .pytype/
156 | 
157 | # Cython debug symbols
158 | cython_debug/
159 | 
160 | # PyCharm
161 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
164 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
165 | #.idea/
166 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀
  2 | 
  3 | <!-- markdownlint-disable first-line-h1 -->
  4 | <!-- markdownlint-disable html -->
  5 | <!-- markdownlint-disable no-duplicate-header -->
  6 | 
  7 | <div align="center">
  8 |     <img src="./assets/sentient-logo-narrow.png" alt="alt text" width="60%"/>
  9 | </div>
 10 | 
 11 | <hr>
 12 | <div align="center" style="line-height: 1;">
 13 |   <a href="https://sentient.xyz/" target="_blank" style="margin: 2px;">
 14 |     <img alt="Homepage" src="https://img.shields.io/badge/Sentient-Homepage-%23EAEAEA?logo=data%3Aimage%2Fsvg%2Bxml%3Bbase64%2CPHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIzNDEuMzMzIiBoZWlnaHQ9IjM0MS4zMzMiIHZlcnNpb249IjEuMCIgdmlld0JveD0iMCAwIDI1NiAyNTYiPjxwYXRoIGQ9Ik0xMzIuNSAyOC40Yy0xLjUgMi4yLTEuMiAzLjkgNC45IDI3LjIgMy41IDEzLjcgOC41IDMzIDExLjEgNDIuOSAyLjYgOS45IDUuMyAxOC42IDYgMTkuNCAzLjIgMy4zIDExLjctLjggMTMuMS02LjQuNS0xLjktMTcuMS03Mi0xOS43LTc4LjYtMS4yLTMtNy41LTYuOS0xMS4zLTYuOS0xLjYgMC0zLjEuOS00LjEgMi40ek0xMTAgMzBjLTEuMSAxLjEtMiAzLjEtMiA0LjVzLjkgMy40IDIgNC41IDMuMSAyIDQuNSAyIDMuNC0uOSA0LjUtMiAyLTMuMSAyLTQuNS0uOS0zLjQtMi00LjUtMy4xLTItNC41LTItMy40LjktNC41IDJ6TTgxLjUgNDYuMWMtMi4yIDEuMi00LjYgMi44LTUuMiAzLjctMS44IDIuMy0xLjYgNS42LjUgNy40IDEuMyAxLjIgMzIuMSAxMC4yIDQ1LjQgMTMuMyAzIC44IDYuOC0yLjIgNi44LTUuMyAwLTMuNi0yLjItOS4yLTMuOS0xMC4xQzEyMy41IDU0LjIgODcuMiA0NCA4NiA0NGMtLjMuMS0yLjMgMS00LjUgMi4xek0xNjUgNDZjLTEuMSAxLjEtMiAyLjUtMiAzLjIgMCAyLjggMTEuMyA0NC41IDEyLjYgNDYuNS45IDEuNSAyLjQgMi4zIDQuMiAyLjMgMy44IDAgOS4yLTUuNiA5LjItOS40IDAtMS41LTIuMS0xMC45LTQuNy0yMC44bC00LjctMTguMS00LjUtMi44Yy01LjMtMy40LTcuNC0zLjYtMTAuMS0uOXpNNDguNyA2NS4xYy03LjcgNC4xLTYuOSAxMC43IDEuNSAxMyAyLjQuNiAyMS40IDUuOCA0Mi4yIDExLjYgMjIuOCA2LjIgMzguOSAxMC4yIDQwLjMgOS44IDMuNS0uOCA0LjYtMy44IDMuMi04LjgtMS41LTUuNy0yLjMtNi41LTguMy04LjJDOTQuMiA3My4xIDU2LjYgNjMgNTQuOCA2M2MtMS4zLjEtNCAxLTYuMSAyLjF6TTE5OC4yIDY0LjdjLTMuMSAyLjgtMy41IDUuNi0xLjEgOC42IDQgNS4xIDEwLjkgMi41IDEwLjktNC4xIDAtNS4zLTUuOC03LjktOS44LTQuNXpNMTgxLjggMTEzLjFjLTI3IDI2LjQtMzEuOCAzMS41LTMxLjggMzMuOSAwIDEuNi43IDMuNSAxLjUgNC40IDEuNyAxLjcgNy4xIDMgMTAuMiAyLjQgMi4xLS4zIDU2LjktNTMuNCA1OS01Ny4xIDEuNy0zLjEgMS42LTkuOC0uMy0xMi41LTMuNi01LjEtNC45LTQuMi0zOC42IDI4Ljl6TTM2LjYgODguMWMtNSA0LTIuNCAxMC45IDQuMiAxMC45IDMuMyAwIDYuMi0yLjkgNi4yLTYuMyAwLTIuMS00LjMtNi43LTYuMy02LjctLjggMC0yLjYuOS00LjEgMi4xek02My40IDk0LjVjLTEuNi43LTguOSA3LjMtMTYuMSAxNC43TDM0IDEyMi43djUuNmMwIDYuMyAxLjYgOC43IDUuOSA4LjcgMi4xIDAgNi0zLjQgMTkuOS0xNy4zIDkuNS05LjUgMTcuMi0xOCAxNy4yLTE4LjkgMC00LjctOC40LTguNi0xMy42LTYuM3pNNjIuOSAxMzAuNiAzNCAxNTkuNXY1LjZjMCA2LjIgMS44IDguOSA2IDguOSAzLjIgMCA2Ni02Mi40IDY2LTY1LjYgMC0zLjMtMy41LTUuNi05LjEtNi4ybC01LS41LTI5IDI4Ljl6TTE5Ni4zIDEzNS4yYy05IDktMTYuNiAxNy4zLTE2LjkgMTguNS0xLjMgNS4xIDIuNiA4LjMgMTAgOC4zIDIuOCAwIDUuMi0yIDE3LjktMTQuOCAxNC41LTE0LjcgMTQuNy0xNC45IDE0LjctMTkuMyAwLTUuOC0yLjItOC45LTYuMi04LjktMi42IDAtNS40IDIuMy0xOS41IDE2LjJ6TTk2IDEzNi44Yy0yLjkuOS04IDYuNi04IDkgMCAxLjMgMi45IDEzLjQgNi40IDI3IDMuNiAxMy42IDcuOSAzMC4zIDkuNyAzNy4yIDEuNyA2LjkgMy42IDEzLjMgNC4xIDE0LjIuNSAxIDIuNiAyLjcgNC44IDMuOCA2LjggMy41IDExIDIuMyAxMS0zLjIgMC0zLTIwLjYtODMuMS0yMi4xLTg1LjktLjktMS45LTMuNi0yLjgtNS45LTIuMXpNMTIwLjUgMTU4LjRjLTEuOSAyLjktMS4yIDguNSAxLjQgMTEuNiAxLjEgMS40IDEyLjEgNC45IDM5LjYgMTIuNSAyMC45IDUuOCAzOC44IDEwLjUgMzkuOCAxMC41czMuNi0xIDUuNy0yLjJjOC4xLTQuNyA3LjEtMTAuNi0yLjMtMTMuMi0yOC4yLTguMS03OC41LTIxLjYtODAuMy0yMS42LTEuNCAwLTMgMS0zLjkgMi40ek0yMTAuNyAxNTguOGMtMS44IDEuOS0yLjIgNS45LS45IDcuOCAxLjUgMi4zIDUgMy40IDcuNiAyLjQgNi40LTIuNCA1LjMtMTEuMi0xLjUtMTEuOC0yLjQtLjItNCAuMy01LjIgMS42ek02OS42IDE2MmMtMiAyLjItMy42IDQuMy0zLjYgNC44LjEgMi42IDEwLjEgMzguNiAxMS4xIDM5LjkgMi4yIDIuNiA5IDUuNSAxMS41IDQuOSA1LTEuMyA0LjktMy0xLjUtMjcuNy0zLjMtMTIuNy02LjUtMjMuNy03LjItMjQuNS0yLjItMi43LTYuNC0xLjctMTAuMyAyLjZ6TTQ5LjYgMTgxLjVjLTIuNCAyLjUtMi45IDUuNC0xLjIgOEM1MiAxOTUgNjAgMTkzIDYwIDE4Ni42YzAtMS45LS44LTQtMS44LTQuOS0yLjMtMi4xLTYuNi0yLjItOC42LS4yek0xMjguNSAxODdjLTIuMyAyLjUtMS4zIDEwLjMgMS42IDEyLjggMi4yIDEuOSAzNC44IDExLjIgMzkuNCAxMS4yIDMuNiAwIDEwLjEtNC4xIDExLTcgLjYtMS45LTEuNy03LTMuMS03LS4yIDAtMTAuMy0yLjctMjIuMy02cy0yMi41LTYtMjMuMy02Yy0uOCAwLTIuMy45LTMuMyAyek0xMzYuNyAyMTYuOGMtMy40IDMuOC0xLjUgOS41IDMuNSAxMC43IDMuOSAxIDguMy0zLjQgNy4zLTcuMy0xLjItNS4xLTcuNS03LjEtMTAuOC0zLjR6Ii8%2BPC9zdmc%2B&link=https%3A%2F%2Fhuggingface.co%2FSentientagi" style="display: inline-block; vertical-align: middle;"/>
 15 |   </a>
 16 |   <a href="https://github.com/sentient-agi" target="_blank" style="margin: 2px;">
 17 |     <img alt="GitHub" src="https://img.shields.io/badge/Github-sentient_agi-181717?logo=github" style="display: inline-block; vertical-align: middle;"/>
 18 |   </a>
 19 |   <a href="https://huggingface.co/Sentientagi" target="_blank" style="margin: 2px;">
 20 |     <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-SentientAGI-ffc107?color=ffc107&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
 21 |   </a>
 22 | </div>
 23 | 
 24 | <div align="center" style="line-height: 1;">
 25 |   <a href="https://discord.gg/sentientfoundation" target="_blank" style="margin: 2px;">
 26 |     <img alt="Discord" src="https://img.shields.io/badge/Discord-SentientAGI-7289da?logo=discord&logoColor=white&color=7289da" style="display: inline-block; vertical-align: middle;"/>
 27 |   </a>
 28 |   <a href="https://x.com/SentientAGI" target="_blank" style="margin: 2px;">
 29 |     <img alt="Twitter Follow" src="https://img.shields.io/badge/-SentientAGI-grey?logo=x&link=https%3A%2F%2Fx.com%2FSentientAGI%2F" style="display: inline-block; vertical-align: middle;"/>
 30 |   </a>
 31 | </div>
 32 | 
 33 | <h4 align="center">
 34 |         <a href="https://arxiv.org/pdf/2503.20201"> Paper  </a>
 35 | </h4>
 36 | 
 37 | ## Description 📝
 38 | 
 39 | OpenDeepSearch is a lightweight yet powerful search tool designed for seamless integration with AI agents. It enables deep web search and retrieval, optimized for use with Hugging Face's **[SmolAgents](https://github.com/huggingface/smolagents)** ecosystem.
 40 | 
 41 | <div align="center">
 42 |     <img src="./assets/evals.png" alt="Evaluation Results" width="80%"/>
 43 | </div>
 44 | 
 45 | - **Performance**: ODS performs on par with closed source search alternatives on single-hop queries such as [SimpleQA](https://openai.com/index/introducing-simpleqa/) 🔍.
 46 | - **Advanced Capabilities**: ODS performs much better than closed source search alternatives on multi-hop queries such as [FRAMES bench](https://huggingface.co/datasets/google/frames-benchmark) 🚀.
 47 | 
 48 | ## Table of Contents 📑
 49 | 
 50 | - [🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀](#opendeepsearch-democratizing-search-with-open-source-reasoning-models-and-reasoning-agents-)
 51 |   - [Description 📝](#description-)
 52 |   - [Table of Contents 📑](#table-of-contents-)
 53 |   - [Features ✨](#features-)
 54 |   - [Installation 📚](#installation-)
 55 |   - [Setup](#setup)
 56 |   - [Usage ️](#usage-️)
 57 |     - [Using OpenDeepSearch Standalone 🔍](#using-opendeepsearch-standalone-)
 58 |     - [Running the Gradio Demo 🖥️](#running-the-gradio-demo-️)
 59 |     - [Integrating with SmolAgents \& LiteLLM 🤖⚙️](#integrating-with-smolagents--litellm-️)
 60 |       - [](#)
 61 |     - [ReAct agent with math and search tools 🤖⚙️](#react-agent-with-math-and-search-tools-️)
 62 |       - [](#-1)
 63 |   - [Search Modes 🔄](#search-modes-)
 64 |     - [Default Mode ⚡](#default-mode-)
 65 |     - [Pro Mode 🔍](#pro-mode-)
 66 |   - [Acknowledgments 💡](#acknowledgments-)
 67 |   - [Citation](#citation)
 68 |   - [Contact 📩](#contact-)
 69 | 
 70 | ## Features ✨
 71 | 
 72 | - **Semantic Search** 🧠: Leverages **[Crawl4AI](https://github.com/unclecode/crawl4ai)** and semantic search rerankers (such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main) and [Jina AI](https://jina.ai/)) to provide in-depth results
 73 | - **Two Modes of Operation** ⚡:
 74 |   - **Default Mode**: Quick and efficient search with minimal latency.
 75 |   - **Pro Mode (Deep Search)**: More in-depth and accurate results at the cost of additional processing time.
 76 | - **Optimized for AI Agents** 🤖: Works seamlessly with **SmolAgents** like `CodeAgent`.
 77 | - **Fast and Lightweight** ⚡: Designed for speed and efficiency with minimal setup.
 78 | - **Extensible** 🔌: Easily configurable to work with different models and APIs.
 79 | 
 80 | ## Installation 📚
 81 | 
 82 | To install OpenDeepSearch, run:
 83 | 
 84 | ```bash
 85 | pip install -e . #you can also use: uv pip install -e .
 86 | pip install -r requirements.txt #you can also use: uv pip install -r requirements.txt
 87 | ```
 88 | 
 89 | Note: you must have `torch` installed.
 90 | Note: using `uv` instead of regular `pip` makes life much easier!
 91 | 
 92 | ### Using PDM (Alternative Package Manager) 📦
 93 | 
 94 | You can also use PDM as an alternative package manager for OpenDeepSearch. PDM is a modern Python package and dependency manager supporting the latest PEP standards.
 95 | 
 96 | ```bash
 97 | # Install PDM if you haven't already
 98 | curl -sSL https://raw.githubusercontent.com/pdm-project/pdm/main/install-pdm.py | python3 -
 99 | 
100 | # Initialize a new PDM project
101 | pdm init
102 | 
103 | # Install OpenDeepSearch and its dependencies
104 | pdm install
105 | 
106 | # Activate the virtual environment
107 | eval "$(pdm venv activate)"
108 | ```
109 | 
110 | PDM offers several advantages:
111 | - Lockfile support for reproducible installations
112 | - PEP 582 support (no virtual environment needed)
113 | - Fast dependency resolution
114 | - Built-in virtual environment management
115 | 
116 | ## Setup
117 | 
118 | 1. **Choose a Search Provider**:
119 |    - **Option 1: Serper.dev**: Get **free 2500 credits** and add your API key.
120 |      - Visit [serper.dev](https://serper.dev) to create an account.
121 |      - Retrieve your API key and store it as an environment variable:
122 | 
123 |      ```bash
124 |      export SERPER_API_KEY='your-api-key-here'
125 |      ```
126 | 
127 |    - **Option 2: SearXNG**: Use a self-hosted or public SearXNG instance.
128 |      - Specify the SearXNG instance URL when initializing OpenDeepSearch.
129 |      - Optionally provide an API key if your instance requires authentication:
130 | 
131 |      ```bash
132 |      export SEARXNG_INSTANCE_URL='https://your-searxng-instance.com'
133 |      export SEARXNG_API_KEY='your-api-key-here'  # Optional
134 |      ```
135 | 
136 | 2. **Choose a Reranking Solution**:
137 |    - **Quick Start with Jina**: Sign up at [Jina AI](https://jina.ai/) to get an API key for immediate use
138 |    - **Self-hosted Option**: Set up [Infinity Embeddings](https://github.com/michaelfeil/infinity) server locally with open source models such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main)
139 |    - For more details on reranking options, see our [Rerankers Guide](src/opendeepsearch/ranking_models/README.md)
140 | 
141 | 3. **Set up LiteLLM Provider**:
142 |    - Choose a provider from the [supported list](https://docs.litellm.ai/docs/providers/), including:
143 |      - OpenAI
144 |      - Anthropic
145 |      - Google (Gemini)
146 |      - OpenRouter
147 |      - HuggingFace
148 |      - Fireworks
149 |      - And many more!
150 |    - Set your chosen provider's API key as an environment variable:
151 |    ```bash
152 |    export <PROVIDER>_API_KEY='your-api-key-here'  # e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY
153 |    ```
154 |    - For OpenAI, you can also set a custom base URL (useful for self-hosted endpoints or proxies):
155 |    ```bash
156 |    export OPENAI_BASE_URL='https://your-custom-openai-endpoint.com'
157 |    ```
158 |    - You can set default LiteLLM model IDs for different tasks:
159 |    ```bash
160 |    # General default model (fallback for all tasks)
161 |    export LITELLM_MODEL_ID='openrouter/google/gemini-2.0-flash-001'
162 | 
163 |    # Task-specific models
164 |    export LITELLM_SEARCH_MODEL_ID='openrouter/google/gemini-2.0-flash-001'  # For search tasks
165 |    export LITELLM_ORCHESTRATOR_MODEL_ID='openrouter/google/gemini-2.0-flash-001'  # For agent orchestration
166 |    export LITELLM_EVAL_MODEL_ID='gpt-4o-mini'  # For evaluation tasks
167 |    ```
168 |    - When initializing OpenDeepSearch, you can specify your chosen model using the provider's format (this will override the environment variables):
169 |    ```python
170 |    search_agent = OpenDeepSearchTool(model_name="provider/model-name")  # e.g., "anthropic/claude-3-opus-20240229", 'huggingface/microsoft/codebert-base', 'openrouter/google/gemini-2.0-flash-001'
171 |    ```
172 | 
173 | ## Usage ️
174 | 
175 | You can use OpenDeepSearch independently or integrate it with **SmolAgents** for enhanced reasoning and code generation capabilities.
176 | 
177 | ### Using OpenDeepSearch Standalone 🔍
178 | 
179 | ```python
180 | from opendeepsearch import OpenDeepSearchTool
181 | import os
182 | 
183 | # Set environment variables for API keys
184 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here"  # If using Serper
185 | # Or for SearXNG
186 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com"
187 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here"  # Optional
188 | 
189 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here"
190 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
191 | 
192 | # Using Serper (default)
193 | search_agent = OpenDeepSearchTool(
194 |     model_name="openrouter/google/gemini-2.0-flash-001",
195 |     reranker="jina"
196 | )
197 | 
198 | # Or using SearXNG
199 | # search_agent = OpenDeepSearchTool(
200 | #     model_name="openrouter/google/gemini-2.0-flash-001",
201 | #     reranker="jina",
202 | #     search_provider="searxng",
203 | #     searxng_instance_url="https://your-searxng-instance.com",
204 | #     searxng_api_key="your-api-key-here"  # Optional
205 | # )
206 | 
207 | if not search_agent.is_initialized:
208 |     search_agent.setup()
209 |     
210 | query = "Fastest land animal?"
211 | result = search_agent.forward(query)
212 | print(result)
213 | ```
214 | 
215 | ### Running the Gradio Demo 🖥️
216 | 
217 | To try out OpenDeepSearch with a user-friendly interface, simply run:
218 | 
219 | ```bash
220 | python gradio_demo.py
221 | ```
222 | 
223 | This will launch a local web interface where you can test different search queries and modes interactively.
224 | 
225 | You can customize the demo with command-line arguments:
226 | 
227 | ```bash
228 | # Using Serper (default)
229 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina"
230 | 
231 | # Using SearXNG
232 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina" \
233 |   --search-provider "searxng" --searxng-instance "https://your-searxng-instance.com" \
234 |   --searxng-api-key "your-api-key-here"  # Optional
235 | ```
236 | 
237 | Available options:
238 | - `--model-name`: LLM model to use for search
239 | - `--orchestrator-model`: LLM model for the agent orchestrator
240 | - `--reranker`: Reranker to use (`jina` or `infinity`)
241 | - `--search-provider`: Search provider to use (`serper` or `searxng`)
242 | - `--searxng-instance`: SearXNG instance URL (required if using `searxng`)
243 | - `--searxng-api-key`: SearXNG API key (optional)
244 | - `--serper-api-key`: Serper API key (optional, will use environment variable if not provided)
245 | - `--openai-base-url`: OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided)
246 | 
247 | ### Integrating with SmolAgents & LiteLLM 🤖⚙️
248 | 
249 | ####
250 | 
251 | ```python
252 | from opendeepsearch import OpenDeepSearchTool
253 | from smolagents import CodeAgent, LiteLLMModel
254 | import os
255 | 
256 | # Set environment variables for API keys
257 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here"  # If using Serper
258 | # Or for SearXNG
259 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com"
260 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here"  # Optional
261 | 
262 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here"
263 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
264 | 
265 | # Using Serper (default)
266 | search_agent = OpenDeepSearchTool(
267 |     model_name="openrouter/google/gemini-2.0-flash-001",
268 |     reranker="jina"
269 | )
270 | 
271 | # Or using SearXNG
272 | # search_agent = OpenDeepSearchTool(
273 | #     model_name="openrouter/google/gemini-2.0-flash-001",
274 | #     reranker="jina",
275 | #     search_provider="searxng",
276 | #     searxng_instance_url="https://your-searxng-instance.com",
277 | #     searxng_api_key="your-api-key-here"  # Optional
278 | # )
279 | 
280 | model = LiteLLMModel(
281 |     "openrouter/google/gemini-2.0-flash-001",
282 |     temperature=0.2
283 | )
284 | 
285 | code_agent = CodeAgent(tools=[search_agent], model=model)
286 | query = "How long would a cheetah at full speed take to run the length of Pont Alexandre III?"
287 | result = code_agent.run(query)
288 | 
289 | print(result)
290 | ```
291 | ### ReAct agent with math and search tools 🤖⚙️
292 | 
293 | ####
294 | ```python
295 | from opendeepsearch import OpenDeepSearchTool
296 | from opendeepsearch.wolfram_tool import WolframAlphaTool
297 | from opendeepsearch.prompts import REACT_PROMPT
298 | from smolagents import LiteLLMModel, ToolCallingAgent, Tool
299 | import os
300 | 
301 | # Set environment variables for API keys
302 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here"
303 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
304 | os.environ["WOLFRAM_ALPHA_APP_ID"] = "your-wolfram-alpha-app-id-here"
305 | os.environ["FIREWORKS_API_KEY"] = "your-fireworks-api-key-here"
306 | 
307 | model = LiteLLMModel(
308 |     "fireworks_ai/llama-v3p1-70b-instruct",  # Your Fireworks Deepseek model
309 |     temperature=0.7
310 | )
311 | search_agent = OpenDeepSearchTool(model_name="fireworks_ai/llama-v3p1-70b-instruct", reranker="jina") # Set reranker to "jina" or "infinity"
312 | 
313 | # Initialize the Wolfram Alpha tool
314 | wolfram_tool = WolframAlphaTool(app_id=os.environ["WOLFRAM_ALPHA_APP_ID"])
315 | 
316 | # Initialize the React Agent with search and wolfram tools
317 | react_agent = ToolCallingAgent(
318 |     tools=[search_agent, wolfram_tool],
319 |     model=model,
320 |     prompt_templates=REACT_PROMPT # Using REACT_PROMPT as system prompt
321 | )
322 | 
323 | # Example query for the React Agent
324 | query = "What is the distance, in metres, between the Colosseum in Rome and the Rialto bridge in Venice"
325 | result = react_agent.run(query)
326 | 
327 | print(result)
328 | ```
329 | 
330 | ## Search Modes 🔄
331 | 
332 | OpenDeepSearch offers two distinct search modes to balance between speed and depth:
333 | 
334 | ### Default Mode ⚡
335 | - Uses SERP-based interaction for quick results
336 | - Minimal processing overhead
337 | - Ideal for single-hop, straightforward queries
338 | - Fast response times
339 | - Perfect for basic information retrieval
340 | 
341 | ### Pro Mode 🔍
342 | - Involves comprehensive web scraping
343 | - Implements semantic reranking of results
344 | - Includes advanced post-processing of data
345 | - Slightly longer processing time
346 | - Excels at:
347 |   - Multi-hop queries
348 |   - Complex search requirements
349 |   - Detailed information gathering
350 |   - Questions requiring cross-reference verification
351 | 
352 | ## Acknowledgments 💡
353 | 
354 | OpenDeepSearch is built on the shoulders of great open-source projects:
355 | 
356 | - **[SmolAgents](https://huggingface.co/docs/smolagents/index)** 🤗 – Powers the agent framework and reasoning capabilities.
357 | - **[Crawl4AI](https://github.com/unclecode/crawl4ai)** 🕷️ – Provides data crawling support.
358 | - **[Infinity Embedding API](https://github.com/michaelfeil/infinity)** 🌍 – Powers semantic search capabilities.
359 | - **[LiteLLM](https://www.litellm.ai/)** 🔥 – Used for efficient AI model integration.
360 | - **Various Open-Source Libraries** 📚 – Enhancing search and retrieval functionalities.
361 | 
362 | ## Citation
363 | 
364 | If you use `OpenDeepSearch` in your works, please cite it using the following BibTex entry:
365 | 
366 | ```
367 | @misc{alzubi2025opendeepsearchdemocratizing,
368 |       title={Open Deep Search: Democratizing Search with Open-source Reasoning Agents},
369 |       author={Salaheddin Alzubi and Creston Brooks and Purva Chiniya and Edoardo Contente and Chiara von Gerlach and Lucas Irwin and Yihan Jiang and Arda Kaz and Windsor Nguyen and Sewoong Oh and Himanshu Tyagi and Pramod Viswanath},
370 |       year={2025},
371 |       eprint={2503.20201},
372 |       archivePrefix={arXiv},
373 |       primaryClass={cs.LG},
374 |       url={https://arxiv.org/abs/2503.20201},
375 | }
376 | ```
377 | 
378 | 
379 | ## Contact 📩
380 | 
381 | For questions or collaborations, open an issue or reach out to the maintainers.
382 | 


--------------------------------------------------------------------------------
/assets/evals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/evals.png


--------------------------------------------------------------------------------
/assets/sentient-logo-narrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/sentient-logo-narrow.png


--------------------------------------------------------------------------------
/evals/README.md:
--------------------------------------------------------------------------------
 1 | # Evaluation Scripts
 2 | 
 3 | This repository contains scripts for running evaluations and autograding on model outputs.
 4 | 
 5 | ## Available Commands
 6 | 
 7 | ### Autograde DataFrame Evaluation
 8 | To evaluate and autograde DataFrame outputs:
 9 | 
10 | ```bash
11 | python -m evals.autograde_dataframe --csv_path <path_to_csv> --output_path <path_to_output_csv>
12 | ```
13 | 
14 | Example:
15 | 
16 | ```bash
17 | python evals/autograde_df.py output/fireworks_ai__accounts__fireworks__models__qwq-32b/codeact/simple_qa_test_set/fireworks_ai__accounts__fireworks__models__qwq-32b__codeact__simple_qa_test_set__trial1.jsonl
18 | ```
19 | 
20 | This command processes the specified JSONL file and performs automated grading on DataFrame outputs.
21 | 
22 | ### Run Task Evaluations
23 | To run evaluations on a dataset with parallel processing:
24 | 
25 | ```bash
26 | python ./evals/eval_tasks.py --parallel-workers=8 --num-trials=1 --eval-tasks=./evals/datasets/frames_test_set.csv ./evals/datasets/simple_qa_test_set.csv
27 | ```
28 | 
29 | Parameters:
30 | - `--date`: Optional date for the evaluation
31 | - `--eval-tasks`: List of paths to CSV files containing evaluation tasks (default: ["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"])
32 | - `--search-model-id`: Model ID for the search tool (default: "fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct")
33 | - `--model-type`: Type of model to use, either "LiteLLMModel" or "HfApiModel" (default: "LiteLLMModel")
34 | - `--model-id`: ID of the model to use (default: "fireworks_ai/accounts/fireworks/models/qwq-32b")
35 | - `--agent-action-type`: Type of agent action: "codeact", "tool-calling", or "vanilla" (default: "codeact")
36 | - `--parallel-workers`: Number of parallel workers to use (default: 8)
37 | - `--num-trials`: Number of evaluation trials to run (default: 1)
38 | 
39 | The results will be saved as a DataFrame in the `evals` directory.
40 | 
41 | ## Output
42 | Evaluation results are stored in the following locations:
43 | - Task evaluation results: `evals/` directory
44 | - DataFrame autograding results: Generated in the script's output
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/evals/autograde_df.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import litellm
 3 | import argparse
 4 | from evals.grader_prompts import GRADER_TEMPLATE
 5 | from multiprocessing import Pool, cpu_count
 6 | from tqdm import tqdm
 7 | 
 8 | def grade_row(row_data):
 9 |     idx, row = row_data
10 |     question = row['original_question']
11 |     predicted_answer = row['answer']
12 |     gold_answer = row['true_answer']
13 |     
14 |     input_prompt = GRADER_TEMPLATE.format(
15 |         question=question,
16 |         predicted_answer=predicted_answer,
17 |         target=gold_answer
18 |     )
19 |     
20 |     try:
21 |         output = litellm.completion(
22 |             model="openrouter/google/gemini-2.0-flash-001",
23 |             messages=[{"role": "user", "content": input_prompt}],
24 |             temperature=0.0
25 |         )['choices'][0]['message']['content']
26 |         return idx, output
27 |     except Exception as e:
28 |         print(f"Error processing row {idx}: {e}")
29 |         return idx, "Error"
30 | 
31 | def autograde_df(df_path, num_cpus=4):
32 |     # Read the dataframe
33 |     df = pd.read_json(df_path, lines=True)
34 |     
35 |     # Prepare data for parallel processing
36 |     row_data = list(df.iterrows())
37 |     
38 |     # Use specified number of CPU cores
39 |     n_processes = max(1, min(num_cpus, cpu_count()))
40 |     print(f"Using {n_processes} processes")
41 |     
42 |     # Create process pool and process rows in parallel
43 |     with Pool(n_processes) as pool:
44 |         # Use tqdm for progress bar
45 |         results = list(tqdm(
46 |             pool.imap(grade_row, row_data),
47 |             total=len(row_data),
48 |             desc="Grading"
49 |         ))
50 |     
51 |     # Sort results by index and extract grades
52 |     results.sort(key=lambda x: x[0])
53 |     final_grades = [grade for _, grade in results]
54 |     
55 |     # Add the grades as a new column
56 |     df['final_grade'] = final_grades
57 |     
58 |     # Save the updated dataframe back to the same file
59 |     df.to_json(df_path, orient='records', lines=True)
60 |     print("Grading completed and results saved!")
61 | 
62 | if __name__ == "__main__":
63 |     parser = argparse.ArgumentParser(description='Auto-grade answers in a DataFrame')
64 |     parser.add_argument('df_path', type=str, help='Path to the DataFrame JSON file')
65 |     parser.add_argument('--num_cpus', type=int, default=4, help='Number of CPU cores to use')
66 |     
67 |     args = parser.parse_args()
68 |     autograde_df(args.df_path, args.num_cpus)
69 | 


--------------------------------------------------------------------------------
/evals/eval_gpt_web.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | from openai import OpenAI
  3 | import time
  4 | from typing import List, Dict, Any
  5 | import json
  6 | import pandas as pd
  7 | from pathlib import Path
  8 | import argparse
  9 | from dotenv import load_dotenv
 10 | import os
 11 | from tqdm import tqdm
 12 | import multiprocessing as mp
 13 | from queue import Empty
 14 | from concurrent.futures import ProcessPoolExecutor
 15 | 
 16 | load_dotenv()
 17 | 
 18 | class WebSearchEvaluator:
 19 |     def __init__(self, model: str, output_path: Path, num_workers: int = 4, trial: int = 0):
 20 |         self.model = model
 21 |         self.output_path = output_path
 22 |         self.num_workers = num_workers
 23 |         self.trial = trial
 24 | 
 25 |         # Load existing results if any
 26 |         self.processed_questions = set()
 27 |         if self.output_path.exists():
 28 |             with open(self.output_path, 'r') as f:
 29 |                 for line in f:
 30 |                     try:
 31 |                         result = json.loads(line)
 32 |                         self.processed_questions.add(result['question'])
 33 |                     except:
 34 |                         continue
 35 | 
 36 |     def worker_init(self):
 37 |         """Initialize OpenAI client for each worker."""
 38 |         # Create new client for each process
 39 |         self.client = OpenAI(
 40 |             api_key=os.environ.get("OPENAI_API_KEY"),
 41 |             base_url=os.environ.get("OPENAI_BASE_URL")
 42 |         )
 43 | 
 44 |     def evaluate_single(self, row: pd.Series) -> Dict[str, Any]:
 45 |         """Evaluate a single question with its true answer."""
 46 |         # Skip if already processed
 47 |         if row['question'] in self.processed_questions:
 48 |             return None
 49 | 
 50 |         if not hasattr(self, 'client'):
 51 |             self.worker_init()
 52 | 
 53 |         try:
 54 |             start_time = time.time()
 55 |             response = self.client.responses.create(
 56 |                 model=self.model,
 57 |                 tools=[{"type": "web_search_preview"}],
 58 |                 input=row['question']
 59 |             )
 60 |             end_time = time.time()
 61 |             result = {
 62 |                 "question": row['question'],
 63 |                 "true_answer": row['true_answer'],
 64 |                 "answer": response.output_text,
 65 |                 "model": self.model,
 66 |                 "time_taken": end_time - start_time,
 67 |                 "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
 68 |             }
 69 |             return result
 70 |         except Exception as e:
 71 |             return {
 72 |                 "question": row['question'],
 73 |                 "true_answer": row['true_answer'],
 74 |                 "answer": None,
 75 |                 "error": str(e),
 76 |                 "model": self.model,
 77 |                 "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
 78 |             }
 79 | 
 80 |     def save_result(self, result: Dict[str, Any]) -> None:
 81 |         """Save a single result to the JSONL file."""
 82 |         with open(self.output_path, 'a') as f:
 83 |             f.write(json.dumps(result) + '\n')
 84 | 
 85 |     def evaluate_batch(self, df: pd.DataFrame) -> None:
 86 |         """Evaluate questions in parallel using multiple workers."""
 87 |         with ProcessPoolExecutor(
 88 |             max_workers=self.num_workers,
 89 |             initializer=self.worker_init
 90 |         ) as executor:
 91 |             # Convert DataFrame rows to list of Series
 92 |             rows = [row for _, row in df.iterrows()]
 93 | 
 94 |             # Create progress bar for total rows
 95 |             with tqdm(total=len(rows), desc="Processing questions") as pbar:
 96 |                 # Submit all tasks
 97 |                 futures = [executor.submit(self.evaluate_single, row) for row in rows]
 98 | 
 99 |                 # Process results as they complete
100 |                 for future in futures:
101 |                     result = future.result()
102 |                     if result is not None:  # Only save if not already processed
103 |                         self.save_result(result)
104 |                     pbar.update(1)
105 | 
106 | def parse_args():
107 |     parser = argparse.ArgumentParser(description='Evaluate questions using GPT-4 with web search')
108 |     parser.add_argument('--output_dir', type=str, default='output',
109 |                       help='Directory to save results (default: output)')
110 |     parser.add_argument('--input_data', type=str,
111 |                       default='./evals/datasets/frames_test_set.csv',
112 |                       help='Path to input CSV file')
113 |     parser.add_argument('--model', type=str,
114 |                       default=os.getenv("LITELLM_EVAL_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "gpt-4o-mini")),
115 |                       help='Model to use for evaluation')
116 |     parser.add_argument('--num_workers', type=int, default=4,
117 |                       help='Number of parallel workers (default: 4)')
118 |     parser.add_argument('--trial', type=int, default=0,
119 |                       help='Trial number for this evaluation run (default: 0)')
120 |     return parser.parse_args()
121 | 
122 | def main():
123 |     args = parse_args()
124 | 
125 |     # Create output directory if it doesn't exist
126 |     output_dir = Path(args.output_dir)
127 |     output_dir.mkdir(parents=True, exist_ok=True)
128 | 
129 |     # Set up output path (now without timestamp)
130 |     output_path = output_dir / f"evaluation_results_{args.model}_trial{args.trial}.jsonl"
131 | 
132 |     # Load input data
133 |     print(f"Loading data from {args.input_data}")
134 |     df = pd.read_csv(args.input_data)
135 |     print(f"Loaded {len(df)} examples")
136 | 
137 |     # Initialize evaluator
138 |     evaluator = WebSearchEvaluator(
139 |         model=args.model,
140 |         output_path=output_path,
141 |         num_workers=args.num_workers,
142 |         trial=args.trial
143 |     )
144 | 
145 |     # Run evaluation
146 |     print(f"Starting evaluation with model {args.model} using {args.num_workers} workers...")
147 |     evaluator.evaluate_batch(df)
148 |     print(f"Results saved to {output_path}")
149 | 
150 |     # Load and display summary
151 |     results_df = pd.read_json(output_path, lines=True)
152 |     print("\nResults summary:")
153 |     print(f"Model: {args.model}")
154 |     print(f"Total evaluations: {len(results_df)}")
155 |     print(f"Successful evaluations: {len(results_df[~results_df['answer'].isna()])}")
156 |     print(f"Failed evaluations: {len(results_df[results_df['answer'].isna()])}")
157 | 
158 | if __name__ == "__main__":
159 |     main()
160 | 


--------------------------------------------------------------------------------
/evals/eval_tasks.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import datetime
  3 | import json
  4 | import os
  5 | import threading
  6 | import time
  7 | from concurrent.futures import ThreadPoolExecutor, as_completed
  8 | from pathlib import Path
  9 | 
 10 | import datasets
 11 | import pandas as pd
 12 | from datasets import Dataset
 13 | from dotenv import load_dotenv
 14 | from tqdm import tqdm
 15 | from opendeepsearch import OpenDeepSearchTool
 16 | 
 17 | from smolagents import (
 18 |     AgentError,
 19 |     CodeAgent,
 20 |     LiteLLMModel,
 21 |     HfApiModel,
 22 |     PythonInterpreterTool,
 23 |     ToolCallingAgent,
 24 | )
 25 | from smolagents.agents import ActionStep
 26 | 
 27 | 
 28 | load_dotenv()
 29 | 
 30 | APPEND_ANSWER_LOCK = threading.Lock()
 31 | 
 32 | 
 33 | def parse_arguments():
 34 |     parser = argparse.ArgumentParser(description="Runs an agent powered by the given model on smolagent benchmark.")
 35 |     parser.add_argument(
 36 |         "--date",
 37 |         type=str,
 38 |         default=None,
 39 |         help="The date for the evaluation.",
 40 |     )
 41 |     parser.add_argument(
 42 |         "--eval-tasks",
 43 |         type=str,
 44 |         nargs="+",
 45 |         default=["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"],
 46 |         help="List of evaluation task paths",
 47 |     )
 48 |     parser.add_argument(
 49 |         "--search-model-id",
 50 |         type=str,
 51 |         default="fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct",
 52 |         help="The model ID to use for the search tool (defaults to same as model-id)",
 53 |     )
 54 |     parser.add_argument(
 55 |         "--model-type",
 56 |         type=str,
 57 |         default="LiteLLMModel",
 58 |         choices=["LiteLLMModel", "HfApiModel"],
 59 |         help="The model type to use (LiteLLMModel or HfApiModel)",
 60 |     )
 61 |     parser.add_argument(
 62 |         "--model-id",
 63 |         type=str,
 64 |         default="fireworks_ai/accounts/fireworks/models/qwq-32b",
 65 |         help="The model ID to use for the specified model type",
 66 |     )
 67 |     parser.add_argument(
 68 |         "--agent-action-type",
 69 |         type=str,
 70 |         default="codeact",
 71 |         choices=["codeact", "tool-calling", "vanilla"],
 72 |         help="The agent action type: 'codeact', 'tool-calling', or 'vanilla' to use the vanilla llm",
 73 |     )
 74 |     parser.add_argument(
 75 |         "--parallel-workers",
 76 |         type=int,
 77 |         default=8,
 78 |         help="The number of processes to run in parallel",
 79 |     )
 80 |     parser.add_argument(
 81 |         "--num-trials",
 82 |         type=int,
 83 |         default=1,
 84 |         help="Number of trials to run for each evaluation",
 85 |     )
 86 |     return parser.parse_args()
 87 | 
 88 | 
 89 | def load_eval_dataset(eval_tasks: list):
 90 |     eval_ds = {}
 91 |     for task_path in eval_tasks:
 92 |         task_name = task_path.split("/")[-1][:-4]
 93 |         df = pd.read_csv(task_path)
 94 |         dataset = Dataset.from_pandas(df)
 95 |         eval_ds[task_name] = dataset
 96 |     return eval_ds
 97 | 
 98 | 
 99 | def serialize_agent_error(obj):
100 |     if isinstance(obj, AgentError):
101 |         return {"error_type": obj.__class__.__name__, "message": obj.message}
102 |     else:
103 |         return str(obj)
104 | 
105 | 
106 | def append_answer(entry: dict, jsonl_file: str) -> None:
107 |     jsonl_file = Path(jsonl_file)
108 |     jsonl_file.parent.mkdir(parents=True, exist_ok=True)
109 |     with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
110 |         fp.write(json.dumps(entry) + "\n")
111 |     assert os.path.exists(jsonl_file), "File not found!"
112 | 
113 | 
114 | def run_with_timeout(func, timeout):
115 |     with ThreadPoolExecutor(max_workers=1) as executor:
116 |         future = executor.submit(func)
117 |         try:
118 |             return future.result(timeout=timeout)
119 |         except TimeoutError:
120 |             return "Timed Out"
121 | 
122 | 
123 | def answer_single_question(example, model, answers_file, action_type, search_model_id=None):
124 |     if action_type == "vanilla":
125 |         agent = model
126 |     elif action_type == "codeact":
127 |         agent = CodeAgent(
128 |             tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id)],
129 |             model=model,
130 |             additional_authorized_imports=["numpy"],
131 |             max_steps=15,
132 |         )
133 |     elif action_type == "tool-calling":
134 |         agent = ToolCallingAgent(
135 |             tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id), PythonInterpreterTool()],
136 |             model=model,
137 |             additional_authorized_imports=["numpy"],
138 |             max_steps=15,
139 |         )
140 | 
141 |     augmented_question = example["question"]
142 |     start_time = time.time()
143 |     TIMEOUT_SECONDS = 300  # 5 minutes timeout
144 | 
145 |     try:
146 |         if action_type == "vanilla":
147 |             def get_vanilla_response():
148 |                 response = agent([{"role": "user", "content": augmented_question}])
149 |                 return response.content, agent.last_output_token_count
150 |             
151 |             answer, token_count = run_with_timeout(get_vanilla_response, TIMEOUT_SECONDS)
152 |             intermediate_steps = answer
153 |         else:
154 |             def get_agent_response():
155 |                 response = str(agent.run(augmented_question))
156 |                 token_count = agent.monitor.get_total_token_counts()
157 |                 # Remove memory from logs to make them more compact.
158 |                 for step in agent.memory.steps:
159 |                     if isinstance(step, ActionStep):
160 |                         step.agent_memory = None
161 |                 return response, token_count, str(agent.memory.steps)
162 |             
163 |             answer, token_count, intermediate_steps = run_with_timeout(get_agent_response, TIMEOUT_SECONDS)
164 | 
165 |         end_time = time.time()
166 |     except Exception as e:
167 |         print("Error on ", augmented_question, e)
168 |         intermediate_steps = []
169 |     end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
170 |     annotated_example = {
171 |         "model_id": model.model_id,
172 |         "agent_action_type": action_type,
173 |         "original_question": example["question"],
174 |         "answer": answer,
175 |         "true_answer": example["true_answer"],
176 |         "intermediate_steps": intermediate_steps,
177 |         "start_time": start_time,
178 |         "end_time": end_time,
179 |         "token_counts": token_count,
180 |     }
181 |     append_answer(annotated_example, answers_file)
182 | 
183 | 
184 | def answer_questions(
185 |     eval_ds,
186 |     model,
187 |     date,
188 |     action_type: str = "codeact",
189 |     output_dir: str = "output",
190 |     parallel_workers: int = 32,
191 |     search_model_id: str = None,
192 |     num_trials: int = 1,
193 | ):
194 |     date = date or datetime.date.today().isoformat()
195 |     model_id = model.model_id
196 |     
197 |     # Create directory structure: output/model_id/action_type/task
198 |     model_dir = model_id.replace('/', '__')
199 |     
200 |     for task in eval_ds:
201 |         task_dir = os.path.join(output_dir, model_dir, action_type, task)
202 |         os.makedirs(task_dir, exist_ok=True)
203 |         
204 |         for trial in range(num_trials):
205 |             file_name = f"{task_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__trial{trial}.jsonl"
206 |             print(f"Starting processing trial {trial + 1}/{num_trials} and writing output to '{file_name}'")
207 |             answered_questions = []
208 |             if os.path.exists(file_name):
209 |                 with open(file_name, "r") as f:
210 |                     for line in f:
211 |                         answered_questions.append(json.loads(line)["original_question"])
212 |             examples_todo = [example for example in eval_ds[task] if example["question"] not in answered_questions]
213 |             print(f"Launching {parallel_workers} parallel workers.")
214 | 
215 |             with ThreadPoolExecutor(max_workers=parallel_workers) as exe:
216 |                 futures = [
217 |                     exe.submit(answer_single_question, example, model, file_name, action_type, search_model_id) 
218 |                     for example in examples_todo
219 |                 ]
220 |                 for f in tqdm(as_completed(futures), total=len(examples_todo), desc="Processing tasks"):
221 |                     f.result()
222 | 
223 |             print("All tasks processed.")
224 | 
225 | 
226 | if __name__ == "__main__":
227 |     args = parse_arguments()
228 | 
229 |     eval_ds = load_eval_dataset(args.eval_tasks)
230 | 
231 |     if args.model_type == "LiteLLMModel":
232 |         model = LiteLLMModel(
233 |             args.model_id,
234 |             max_completion_tokens=8192,
235 |             temperature=0.2,
236 |             # api_key=os.getenv("OPENROUTER_API_KEY"),
237 |         )
238 |     else:
239 |         model = HfApiModel(args.model_id, provider="together", max_tokens=8192)
240 | 
241 |     answer_questions(
242 |         eval_ds,
243 |         model,
244 |         args.date,
245 |         action_type=args.agent_action_type,
246 |         parallel_workers=args.parallel_workers,
247 |         search_model_id=args.search_model_id,
248 |         num_trials=args.num_trials,
249 |     )


--------------------------------------------------------------------------------
/evals/gpt_web_extract.py:
--------------------------------------------------------------------------------
 1 | import litellm
 2 | from multiprocessing import Pool
 3 | import pandas as pd
 4 | from tqdm import tqdm
 5 | import argparse
 6 | 
 7 | input_prompt = """You are a precise answer extractor. Your job is to read a question and a detailed answer, then output ONLY the final answer without any explanation.
 8 | 
 9 | For example:
10 | Question: "What is 2+2?"
11 | Detailed Answer: "Let me calculate this. 2 plus 2 equals 4, which is a basic mathematical fact."
12 | Final Answer: 4
13 | 
14 | Question: "What color is the sky on a clear day?"
15 | Detailed Answer: "When we look up on a clear day, the sky appears blue due to a phenomenon called Rayleigh scattering."
16 | Final Answer: blue
17 | 
18 | Question: "If my future wife has the same first name as the 15th first lady of the United States' mother and her surname is the same as the second assassinated president's mother's maiden name, what is my future wife's name?"
19 | Detailed Answer: "The 15th First Lady of the United States was Ellen Wilson, and her mother's name was Hannah. The second assassinated president was Abraham Lincoln, and his mother's maiden name was Hodge. \n\nPutting that together, your future wife's name is **Hannah Hodge**."
20 | Final Answer: Hannah Hodge
21 | 
22 | Now do this:
23 | Question: {question}
24 | Detailed Answer: {detailed_answer}
25 | Final Answer:"""
26 | 
27 | def process_row(row):
28 |     """Process a single row using litellm."""
29 |     try:
30 |         output = litellm.completion(
31 |             model="openrouter/google/gemini-2.0-flash-001",
32 |             messages=[{
33 |                 "role": "user", 
34 |                 "content": input_prompt.format(
35 |                     question=row['question'], 
36 |                     detailed_answer=row['original_answer']
37 |                 )
38 |             }],
39 |             temperature=0.3
40 |         )
41 |         return output['choices'][0]['message']['content']
42 |     except Exception as e:
43 |         print(f"Error processing row: {e}")
44 |         return None
45 | 
46 | def process_dataframe(df, num_workers=4):
47 |     """Process the entire dataframe using a pool of workers."""
48 |     with Pool(num_workers) as pool:
49 |         # Use tqdm to show progress bar
50 |         results = list(tqdm(
51 |             pool.imap(process_row, [row for _, row in df.iterrows()]),
52 |             total=len(df)
53 |         ))
54 |     
55 |     # Add results as a new column
56 |     df['processed_output'] = results
57 |     return df
58 | 
59 | if __name__ == '__main__':
60 |     parser = argparse.ArgumentParser(description='Process a CSV file using litellm in parallel')
61 |     parser.add_argument('input_file', type=str, help='Path to the input CSV file')
62 |     parser.add_argument('--workers', type=int, default=4, help='Number of worker processes (default: 4)')
63 |     
64 |     args = parser.parse_args()
65 |     
66 |     # Load and process the dataframe
67 |     df = pd.read_json(args.input_file, lines=True)
68 |     
69 |     # Rename 'answer' to 'original_answer'
70 |     df = df.rename(columns={'answer': 'original_answer'})
71 |     
72 |     # Process the dataframe and store results in 'answer' column
73 |     processed_df = process_dataframe(df, num_workers=args.workers)
74 |     processed_df = processed_df.rename(columns={'processed_output': 'answer'})
75 |     
76 |     # Save to output file (adding '_processed' before the extension)
77 |     output_file = args.input_file.rsplit('.', 1)[0] + '_processed.' + args.input_file.rsplit('.', 1)[1]
78 |     processed_df.to_csv(output_file, index=False)
79 |     print(f"Processed data saved to: {output_file}")
80 | 


--------------------------------------------------------------------------------
/evals/grader_prompts.py:
--------------------------------------------------------------------------------
 1 | GRADER_TEMPLATE = """
 2 | Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
 3 | First, I will give examples of each grade, and then you will grade a new example.
 4 | 
 5 | 
 6 | The following are examples of CORRECT predicted answers.
 7 | ```
 8 | Question: What are the names of Barack Obama's children?
 9 | Gold target: Malia Obama and Sasha Obama
10 | Predicted answer 1: sasha and malia obama
11 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
12 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
13 | ```
14 | These predicted answers are all CORRECT because:
15 |     - They fully contain the important information in the gold target.
16 |     - They do not contain any information that contradicts the gold target.
17 |     - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
18 |     - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
19 | 
20 | 
21 | The following are examples of INCORRECT predicted answers.
22 | ```
23 | Question: What are the names of Barack Obama's children?
24 | Gold target: Malia and Sasha
25 | Predicted answer 1: Malia.
26 | Predicted answer 2: Malia, Sasha, and Susan.
27 | Predicted answer 3: Barack Obama does not have any children.
28 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
29 | Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
30 | Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
31 | Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
32 | ```
33 | These predicted answers are all INCORRECT because:
34 |     - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
35 | 
36 | 
37 | The following are examples of NOT_ATTEMPTED predicted answers.
38 | ```
39 | Question: What are the names of Barack Obama's children?
40 | Gold target: Malia and Sasha
41 | Predicted answer 1: I don't know.
42 | Predicted answer 2: I need more context about which Obama you are talking about.
43 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
44 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
45 | ```
46 | These predicted answers are all NOT_ATTEMPTED because:
47 |     - The important information in the gold target is not included in the answer.
48 |     - No statements in the answer contradict the gold target.
49 | 
50 | 
51 | Also note the following things:
52 | - For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". 
53 |     - Predicted answers "120k", "124k", and 115k" are all CORRECT. 
54 |     - Predicted answers "100k" and "113k" are INCORRECT. 
55 |     - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
56 |     - The presence or absence of commas in numbers (e.g., "5,876" vs "5876") does not affect grading.
57 |     - Numbers written as words or digits are equivalent (e.g., "2 million" vs "2000000" vs "2,000,000" are all considered the same).
58 |     - For large numerical answers, a margin of error of ±1% is acceptable (e.g., if the gold answer is 855, predicted answers between 846.45 and 863.55 are CORRECT).
59 | - The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
60 |     - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
61 | - Do not punish predicted answers if they omit information that would be clearly inferred from the question.
62 |     - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
63 |     - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
64 |     - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
65 |     - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
66 | - Do not punish for typos in people's name if it's clearly the same name. 
67 |     - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
68 | 
69 | 
70 | Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
71 | ```
72 | Question: {question}
73 | Gold target: {target}
74 | Predicted answer: {predicted_answer}
75 | ```
76 | 
77 | Grade the predicted answer of this new question as one of:
78 | A: CORRECT
79 | B: INCORRECT
80 | C: NOT_ATTEMPTED
81 | 
82 | Just return the letters "A", "B", or "C", with no text around it.
83 | """.strip()
84 | 


--------------------------------------------------------------------------------
/gradio_demo.py:
--------------------------------------------------------------------------------
 1 | from smolagents import CodeAgent, GradioUI, LiteLLMModel
 2 | from opendeepsearch import OpenDeepSearchTool
 3 | import os
 4 | from dotenv import load_dotenv
 5 | import argparse
 6 | 
 7 | # Load environment variables
 8 | load_dotenv()
 9 | 
10 | # Add command line argument parsing
11 | parser = argparse.ArgumentParser(description='Run the Gradio demo with custom models')
12 | parser.add_argument('--model-name',
13 |                    default=os.getenv("LITELLM_SEARCH_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
14 |                    help='Model name for search')
15 | parser.add_argument('--orchestrator-model',
16 |                    default=os.getenv("LITELLM_ORCHESTRATOR_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
17 |                    help='Model name for orchestration')
18 | parser.add_argument('--reranker',
19 |                    choices=['jina', 'infinity'],
20 |                    default='jina',
21 |                    help='Reranker to use (jina or infinity)')
22 | parser.add_argument('--search-provider',
23 |                    choices=['serper', 'searxng'],
24 |                    default='serper',
25 |                    help='Search provider to use (serper or searxng)')
26 | parser.add_argument('--searxng-instance',
27 |                    help='SearXNG instance URL (required if search-provider is searxng)')
28 | parser.add_argument('--searxng-api-key',
29 |                    help='SearXNG API key (optional)')
30 | parser.add_argument('--serper-api-key',
31 |                    help='Serper API key (optional, will use SERPER_API_KEY env var if not provided)')
32 | parser.add_argument('--openai-base-url',
33 |                    help='OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided)')
34 | parser.add_argument('--server-port',
35 |                    type=int,
36 |                    default=7860,
37 |                    help='Port to run the Gradio server on')
38 | 
39 | args = parser.parse_args()
40 | 
41 | # Validate arguments
42 | if args.search_provider == 'searxng' and not (args.searxng_instance or os.getenv('SEARXNG_INSTANCE_URL')):
43 |     parser.error("--searxng-instance is required when using --search-provider=searxng")
44 | 
45 | # Set OpenAI base URL if provided via command line
46 | if args.openai_base_url:
47 |     os.environ["OPENAI_BASE_URL"] = args.openai_base_url
48 | 
49 | # Use the command line arguments
50 | search_tool = OpenDeepSearchTool(
51 |     model_name=args.model_name,
52 |     reranker=args.reranker,
53 |     search_provider=args.search_provider,
54 |     serper_api_key=args.serper_api_key,
55 |     searxng_instance_url=args.searxng_instance,
56 |     searxng_api_key=args.searxng_api_key
57 | )
58 | model = LiteLLMModel(
59 |     model_id=args.orchestrator_model,
60 |     temperature=0.2,
61 | )
62 | 
63 | # Initialize the agent with the search tool
64 | agent = CodeAgent(tools=[search_tool], model=model)
65 | 
66 | # Add a name when initializing GradioUI
67 | GradioUI(agent).launch(server_name="127.0.0.1", server_port=args.server_port, share=False)
68 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "OpenDeepSearch"
 3 | version = "0.1.0"
 4 | description = "Default template for PDM package"
 5 | authors = [
 6 |     {name = "Salaheddin Alzu'bi", email = "salaheddinalzubi@gmail.com"},
 7 | ]
 8 | 
 9 | dependencies = ["openai>=1.66.2", "datasets>=3.3.2", "transformers>=4.49.0", "litellm>=1.61.20", "langchain>=0.3.19", "crawl4ai @ git+https://github.com/salzubi401/crawl4ai.git@main", "fasttext-wheel>=0.9.2", "wikipedia-api>=0.8.1", "pillow>=10.4.0", "smolagents>=1.9.2", "gradio==5.20.1"]
10 | requires-python = ">=3.10"
11 | readme = "README.md"
12 | license = {text = "MIT"}
13 | 
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 | 
18 | 
19 | [tool.pdm]
20 | distribution = true
21 | 
22 | [tool.hatch.metadata]
23 | allow-direct-references = true
24 | 
25 | [tool.uv]
26 | python = "3.10"
27 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | openai>=1.65.1
 2 | datasets>=3.3.2
 3 | transformers>=4.49.0
 4 | litellm>=1.61.20
 5 | langchain>=0.3.19
 6 | git+https://github.com/salzubi401/crawl4ai.git@main
 7 | fasttext-wheel>=0.9.2
 8 | wikipedia-api>=0.8.1
 9 | pillow>=10.4.0
10 | smolagents>=1.9.2
11 | gradio==5.20.1
12 | 
13 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/__init__.py:
--------------------------------------------------------------------------------
1 | from .ods_agent import OpenDeepSearchAgent
2 | from .ods_tool import OpenDeepSearchTool
3 | 
4 | __all__ = ['OpenDeepSearchAgent', 'OpenDeepSearchTool']
5 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/build_context.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict, Optional
 2 | from loguru import logger
 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 4 | 
 5 | 
 6 | def extract_information(organic_results: List[Dict]) -> List[str]:
 7 |     """Extract snippets from organic search results in a formatted string."""
 8 |     formatted_results = []
 9 |     for item in organic_results:
10 |         if 'snippet' in item:
11 |             result_parts = [
12 |                 f"title: {item.get('title', 'N/A')}",
13 |                 f"date authored: {item.get('date', 'N/A')}",
14 |                 f"link: {item.get('link', 'N/A')}",
15 |                 f"snippet: {item['snippet']}"
16 |             ]
17 |             
18 |             if 'html' in item:
19 |                 result_parts.append(f"additional information: {item['html']}")
20 |             
21 |             formatted_results.append('\n'.join(result_parts))
22 |     
23 |     return formatted_results
24 | 
25 | def extract_top_stories(top_stories: Optional[List[Dict]]) -> List[str]:
26 |     """Extract titles from top stories."""
27 |     if not top_stories:
28 |         return []
29 |     
30 |     return [
31 |         item['title'] 
32 |         for item in top_stories 
33 |         if 'title' in item
34 |     ]
35 | 
36 | def extract_answer_box(
37 |     answer_box: Optional[Dict]
38 | ) -> List[str]:
39 |     """Extract information from answer box."""
40 |     results = []
41 |     
42 |     if answer_box:
43 |         for key in ['answer', 'snippet']:
44 |             if answer_box.get(key):
45 |                 results.append(answer_box[key])
46 |     
47 |     return results
48 | 
49 | def build_context(
50 |     sources_result: Dict,
51 | ) -> str:
52 |     """
53 |     Build context from search results.
54 |     
55 |     Args:
56 |         sources_result: Dictionary containing search results
57 |         
58 |     Returns:
59 |         A formatted string containing all relevant search results
60 |     """
61 |     try:
62 |         # Build context from different components
63 |         organic_results = extract_information(sources_result.get('organic', []))
64 |         top_stories = extract_top_stories(sources_result.get('topStories'))
65 |         answer_box = extract_answer_box(
66 |             sources_result.get('answerBox')
67 |         )
68 |         
69 |         # Combine all results into a single string
70 |         context_parts = []
71 |         
72 |         # Add answer box if available
73 |         if answer_box:
74 |             context_parts.append("ANSWER BOX:")
75 |             context_parts.extend(answer_box)
76 |             context_parts.append("")  # Empty line for separation
77 |         
78 |         # Add organic results
79 |         if organic_results:
80 |             context_parts.append("SEARCH RESULTS:")
81 |             context_parts.extend(organic_results)
82 |             context_parts.append("")  # Empty line for separation
83 |         
84 |         # Add top stories if available
85 |         if top_stories:
86 |             context_parts.append("TOP STORIES:")
87 |             context_parts.extend(top_stories)
88 |         
89 |         # Join all parts with newlines
90 |         return "\n".join(context_parts)
91 | 
92 |     except Exception as e:
93 |         logger.exception(f"An error occurred while building context: {e}")
94 |         return ""  # Return empty string in case of error
95 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/process_sources_pro.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from typing import List, Optional, Tuple
  3 | from opendeepsearch.context_scraping.crawl4ai_scraper import WebScraper
  4 | from opendeepsearch.ranking_models.infinity_rerank import InfinitySemanticSearcher
  5 | from opendeepsearch.ranking_models.jina_reranker import JinaReranker
  6 | from opendeepsearch.ranking_models.chunker import Chunker 
  7 | 
  8 | @dataclass
  9 | class Source:
 10 |     link: str
 11 |     html: str = ""
 12 |     # Add other relevant fields here
 13 | 
 14 | class SourceProcessor:
 15 |     def __init__(
 16 |         self, 
 17 |         top_results: int = 5,
 18 |         strategies: List[str] = ["no_extraction"],
 19 |         filter_content: bool = True,
 20 |         reranker: str = "infinity"
 21 |     ):
 22 |         self.strategies = strategies
 23 |         self.filter_content = filter_content
 24 |         self.scraper = WebScraper(
 25 |             strategies=self.strategies, 
 26 |             filter_content=self.filter_content
 27 |         )
 28 |         self.top_results = top_results
 29 |         self.chunker = Chunker()
 30 |         
 31 |         # Initialize the appropriate reranker
 32 |         if reranker.lower() == "jina":
 33 |             self.semantic_searcher = JinaReranker()
 34 |             print("Using Jina Reranker")
 35 |         else:  # default to infinity
 36 |             self.semantic_searcher = InfinitySemanticSearcher()
 37 |             print("Using Infinity Reranker")
 38 | 
 39 |     async def process_sources(
 40 |         self, 
 41 |         sources: List[dict], 
 42 |         num_elements: int, 
 43 |         query: str, 
 44 |         pro_mode: bool = False
 45 |     ) -> List[dict]:
 46 |         try:
 47 |             valid_sources = self._get_valid_sources(sources, num_elements)
 48 |             if not valid_sources:
 49 |                 return sources
 50 | 
 51 |             if not pro_mode:
 52 |                 # Check if there's a Wikipedia article among valid sources
 53 |                 wiki_sources = [(i, source) for i, source in valid_sources 
 54 |                               if 'wikipedia.org' in source['link']]
 55 |                 if not wiki_sources:
 56 |                     return sources.data
 57 |                 # If Wikipedia article exists, only process that
 58 |                 valid_sources = wiki_sources[:1]  # Take only the first Wikipedia source
 59 | 
 60 |             html_contents = await self._fetch_html_contents([s[1]['link'] for s in valid_sources])
 61 |             return self._update_sources_with_content(sources.data, valid_sources, html_contents, query)
 62 |         except Exception as e:
 63 |             print(f"Error in process_sources: {e}")
 64 |             return sources
 65 | 
 66 |     def _get_valid_sources(self, sources: List[dict], num_elements: int) -> List[Tuple[int, dict]]:
 67 |         return [(i, source) for i, source in enumerate(sources.data['organic'][:num_elements]) if source]
 68 | 
 69 |     async def _fetch_html_contents(self, links: List[str]) -> List[str]:
 70 |         raw_contents = await self.scraper.scrape_many(links)
 71 |         return [x['no_extraction'].content for x in raw_contents.values()]
 72 | 
 73 |     def _process_html_content(self, html: str, query: str) -> str:
 74 |         if not html:
 75 |             return ""
 76 |         try:
 77 |             # Split the HTML content into chunks
 78 |             documents = self.chunker.split_text(html)
 79 |             
 80 |             # Rerank the chunks based on the query
 81 |             reranked_content = self.semantic_searcher.get_reranked_documents(
 82 |                 query,
 83 |                 documents,
 84 |                 top_k=self.top_results
 85 |             )
 86 |             
 87 |             return reranked_content
 88 |         
 89 |         except Exception as e:
 90 |             print(f"Error in content processing: {e}")
 91 |             return ""
 92 | 
 93 |     def _update_sources_with_content(
 94 |         self, 
 95 |         sources: List[dict],
 96 |         valid_sources: List[Tuple[int, dict]], 
 97 |         html_contents: List[str],
 98 |         query: str
 99 |     ) -> List[dict]:
100 |         for (i, source), html in zip(valid_sources, html_contents):
101 |             source['html'] = self._process_html_content(html, query)
102 |             # sources[i] = source
103 |         return sources


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/basic_web_scraper.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Contains the BasicWebScraper class for basic web scraping functionality.
 3 | """
 4 | 
 5 | from dataclasses import dataclass
 6 | from typing import Optional
 7 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 8 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 9 | from crawl4ai.content_filter_strategy import PruningContentFilter
10 | 
11 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult
12 | from crawl4ai.extraction_strategy import ExtractionStrategy
13 | 
14 | @dataclass
15 | class ExtractionConfig:
16 |     """Configuration for extraction strategies"""
17 |     name: str
18 |     strategy: ExtractionStrategy 
19 | 
20 | class BasicWebScraper:
21 |     """Basic web scraper implementation"""
22 |     def __init__(self, browser_config: Optional[BrowserConfig] = None):
23 |         self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True)
24 |         
25 |     def _create_crawler_config(self) -> CrawlerRunConfig:
26 |         """Creates default crawler configuration"""
27 |         return CrawlerRunConfig(
28 |             cache_mode=CacheMode.BYPASS,
29 |             markdown_generator=DefaultMarkdownGenerator(
30 |                 content_filter=PruningContentFilter()
31 |             )
32 |         )
33 | 
34 |     async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult:
35 |         """Performs extraction using specified strategy"""
36 |         try:
37 |             config = self._create_crawler_config()
38 |             config.extraction_strategy = extraction_config.strategy
39 | 
40 |             async with AsyncWebCrawler(config=self.browser_config) as crawler:
41 |                 result = await crawler.arun(url=url, config=config)
42 | 
43 |             extraction_result = ExtractionResult(
44 |                 name=extraction_config.name,
45 |                 success=result.success,
46 |                 content=result.extracted_content
47 |             )
48 |             
49 |             if result.success:
50 |                 extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown)
51 |                 extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations)
52 | 
53 |             return extraction_result
54 | 
55 |         except Exception as e:
56 |             return ExtractionResult(
57 |                 name=extraction_config.name,
58 |                 success=False,
59 |                 error=str(e)
60 |             ) 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/crawl4ai_scraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Modular web scraping implementation using Crawl4AI.
  3 | Supports multiple extraction strategies including LLM, CSS, and XPath.
  4 | """
  5 | 
  6 | import asyncio
  7 | import os
  8 | from dataclasses import dataclass
  9 | from typing import Dict, List, Optional
 10 | 
 11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 12 | from crawl4ai.content_filter_strategy import PruningContentFilter
 13 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 14 | 
 15 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult, print_extraction_result
 16 | from opendeepsearch.context_scraping.basic_web_scraper import ExtractionConfig
 17 | from opendeepsearch.context_scraping.strategy_factory import StrategyFactory
 18 | 
 19 | class WebScraper:
 20 |     """Unified scraper that encapsulates all extraction strategies and configuration"""
 21 |     def __init__(
 22 |         self, 
 23 |         browser_config: Optional[BrowserConfig] = None,
 24 |         strategies: List[str] = ['no_extraction'],
 25 |         llm_instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content",
 26 |         user_query: Optional[str] = None,
 27 |         debug: bool = False,
 28 |         filter_content: bool = False
 29 |     ):
 30 |         self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True)
 31 |         self.debug = debug
 32 |         self.factory = StrategyFactory()
 33 |         self.strategies = strategies or ['markdown_llm', 'html_llm', 'fit_markdown_llm', 'css', 'xpath', 'no_extraction', 'cosine']
 34 |         self.llm_instruction = llm_instruction
 35 |         self.user_query = user_query
 36 |         self.filter_content = filter_content
 37 |         
 38 |         # Validate strategies
 39 |         valid_strategies = {'markdown_llm', 'html_llm', 'fit_markdown_llm', 'css', 'xpath', 'no_extraction', 'cosine'}
 40 |         invalid_strategies = set(self.strategies) - valid_strategies
 41 |         if invalid_strategies:
 42 |             raise ValueError(f"Invalid strategies: {invalid_strategies}")
 43 |             
 44 |         # Initialize strategy map
 45 |         self.strategy_map = {
 46 |             'markdown_llm': lambda: self.factory.create_llm_strategy('markdown', self.llm_instruction),
 47 |             'html_llm': lambda: self.factory.create_llm_strategy('html', self.llm_instruction),
 48 |             'fit_markdown_llm': lambda: self.factory.create_llm_strategy('fit_markdown', self.llm_instruction),
 49 |             'css': self.factory.create_css_strategy,
 50 |             'xpath': self.factory.create_xpath_strategy,
 51 |             'no_extraction': self.factory.create_no_extraction_strategy,
 52 |             'cosine': lambda: self.factory.create_cosine_strategy(debug=self.debug)
 53 |         }
 54 | 
 55 |     def _create_crawler_config(self) -> CrawlerRunConfig:
 56 |         """Creates default crawler configuration"""
 57 |         content_filter = PruningContentFilter(user_query=self.user_query) if self.user_query else PruningContentFilter()
 58 |         return CrawlerRunConfig(
 59 |             cache_mode=CacheMode.BYPASS,
 60 |             markdown_generator=DefaultMarkdownGenerator(
 61 |                 content_filter=content_filter
 62 |             )
 63 |         )
 64 | 
 65 |     async def scrape(self, url: str) -> Dict[str, ExtractionResult]:
 66 |         """
 67 |         Scrape URL using configured strategies
 68 |         
 69 |         Args:
 70 |             url: Target URL to scrape
 71 |         """
 72 |         # Handle Wikipedia URLs
 73 |         if 'wikipedia.org/wiki/' in url:
 74 |             from src.opendeepsearch.context_scraping.utils import get_wikipedia_content
 75 |             try:
 76 |                 content = get_wikipedia_content(url)
 77 |                 # Create same result for all strategies since we're using Wikipedia content
 78 |                 return {
 79 |                     strategy_name: ExtractionResult(
 80 |                         name=strategy_name,
 81 |                         success=True,
 82 |                         content=content
 83 |                     ) for strategy_name in self.strategies
 84 |                 }
 85 |             except Exception as e:
 86 |                 if self.debug:
 87 |                     print(f"Debug: Wikipedia extraction failed: {str(e)}")
 88 |                 # If Wikipedia extraction fails, fall through to normal scraping
 89 |         
 90 |         # Normal scraping for non-Wikipedia URLs or if Wikipedia extraction failed
 91 |         results = {}
 92 |         for strategy_name in self.strategies:
 93 |             config = ExtractionConfig(
 94 |                 name=strategy_name,
 95 |                 strategy=self.strategy_map[strategy_name]()
 96 |             )
 97 |             result = await self.extract(config, url)
 98 |             results[strategy_name] = result
 99 |             
100 |         return results
101 |     
102 |     async def scrape_many(self, urls: List[str]) -> Dict[str, Dict[str, ExtractionResult]]:
103 |         """
104 |         Scrape multiple URLs using configured strategies in parallel
105 |         
106 |         Args:
107 |             urls: List of target URLs to scrape
108 |             
109 |         Returns:
110 |             Dictionary mapping URLs to their extraction results
111 |         """
112 |         # Create tasks for all URLs
113 |         tasks = [self.scrape(url) for url in urls]
114 |         # Run all tasks concurrently
115 |         results_list = await asyncio.gather(*tasks)
116 |         
117 |         # Build results dictionary
118 |         results = {}
119 |         for url, result in zip(urls, results_list):
120 |             results[url] = result
121 |             
122 |         return results
123 | 
124 |     async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult:
125 |         """Internal method to perform extraction using specified strategy"""
126 |         try:
127 |             config = self._create_crawler_config()
128 |             config.extraction_strategy = extraction_config.strategy
129 | 
130 |             if self.debug:
131 |                 print(f"\nDebug: Attempting extraction with strategy: {extraction_config.name}")
132 |                 print(f"Debug: URL: {url}")
133 |                 print(f"Debug: Strategy config: {config.extraction_strategy}")
134 |                 if self.user_query:
135 |                     print(f"Debug: User query: {self.user_query}")
136 | 
137 |             async with AsyncWebCrawler(config=self.browser_config) as crawler:
138 |                 if isinstance(url, list):
139 |                     result = await crawler.arun_many(urls=url, config=config)
140 |                 else:
141 |                     result = await crawler.arun(url=url, config=config)
142 | 
143 |             if self.debug:
144 |                 print(f"Debug: Raw result attributes: {dir(result)}")
145 |                 print(f"Debug: Raw result: {result.__dict__}")
146 | 
147 |             # Handle different result formats based on strategy
148 |             content = None
149 |             if result.success:
150 |                 if extraction_config.name in ['no_extraction', 'cosine']:
151 |                     # For strategies that return a list of dictionaries
152 |                     if hasattr(result, 'markdown_v2'):
153 |                         content = result.markdown_v2.raw_markdown
154 |                     elif hasattr(result, 'raw_html'):
155 |                         content = result.raw_html
156 |                     elif hasattr(result, 'extracted_content') and result.extracted_content:
157 |                         if isinstance(result.extracted_content, list):
158 |                             content = '\n'.join(item.get('content', '') for item in result.extracted_content)
159 |                         else:
160 |                             content = result.extracted_content
161 |                     
162 |                     if self.filter_content and content:
163 |                         from src.opendeepsearch.context_scraping.utils import filter_quality_content
164 |                         content = filter_quality_content(content)
165 |                 else:
166 |                     content = result.extracted_content
167 |                     if self.filter_content and content:
168 |                         from src.opendeepsearch.context_scraping.utils import filter_quality_content
169 |                         content = filter_quality_content(content)
170 | 
171 |             if self.debug:
172 |                 print(f"Debug: Processed content: {content[:200] if content else None}")
173 | 
174 |             extraction_result = ExtractionResult(
175 |                 name=extraction_config.name,
176 |                 success=result.success,
177 |                 content=content,
178 |                 error=getattr(result, 'error', None)  # Capture error if available
179 |             )
180 |             
181 |             if result.success:
182 |                 extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown)
183 |                 extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations)
184 |             elif self.debug:
185 |                 print(f"Debug: Final extraction result: {extraction_result.__dict__}")
186 | 
187 |             return extraction_result
188 | 
189 |         except Exception as e:
190 |             if self.debug:
191 |                 import traceback
192 |                 print(f"Debug: Exception occurred during extraction:")
193 |                 print(traceback.format_exc())
194 |             
195 |             return ExtractionResult(
196 |                 name=extraction_config.name,
197 |                 success=False,
198 |                 error=str(e)
199 |             )
200 | 
201 | async def main():
202 |     # Example usage with single URL
203 |     single_url = "https://example.com/product-page"
204 |     scraper = WebScraper(debug=True)
205 |     results = await scraper.scrape(single_url)
206 |     
207 |     # Print single URL results
208 |     for result in results.values():
209 |         print_extraction_result(result)
210 | 
211 |     # Example usage with multiple URLs
212 |     urls = [
213 |         "https://example.com",
214 |         "https://python.org",
215 |         "https://github.com"
216 |     ]
217 |     
218 |     multi_results = await scraper.scrape_many(urls)
219 |     
220 |     # Print multiple URL results
221 |     for url, url_results in multi_results.items():
222 |         print(f"\nResults for {url}:")
223 |         for result in url_results.values():
224 |             print_extraction_result(result)
225 | 
226 | if __name__ == "__main__":
227 |     asyncio.run(main())
228 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/extraction_result.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Contains the ExtractionResult class for holding extraction operation results.
 3 | """
 4 | 
 5 | from typing import Optional
 6 | 
 7 | class ExtractionResult:
 8 |     """Holds the results of an extraction operation"""
 9 |     def __init__(self, name: str, success: bool, content: Optional[str] = None, error: Optional[str] = None):
10 |         self.name = name
11 |         self.success = success
12 |         self.content = content
13 |         self.error = error
14 |         self.raw_markdown_length = 0
15 |         self.citations_markdown_length = 0
16 | 
17 | def print_extraction_result(result: ExtractionResult):
18 |     """Utility function to print extraction results"""
19 |     if result.success:
20 |         print(f"\n=== {result.name} Results ===")
21 |         print(f"Extracted Content: {result.content}")
22 |         print(f"Raw Markdown Length: {result.raw_markdown_length}")
23 |         print(f"Citations Markdown Length: {result.citations_markdown_length}")
24 |     else:
25 |         print(f"Error in {result.name}: {result.error}") 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/fast_scraper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Enhanced web scraping implementation using Crawl4AI and vLLM.
  3 | Supports multiple extraction strategies with LLM-powered content processing.
  4 | """
  5 | 
  6 | import asyncio
  7 | from dataclasses import dataclass
  8 | from typing import Dict, List, Optional, Any
  9 | import json
 10 | 
 11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 12 | from vllm import LLM, SamplingParams
 13 | 
 14 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult
 15 | from opendeepsearch.context_scraping.utils import clean_html, get_wikipedia_content
 16 | 
 17 | @dataclass
 18 | class LLMConfig:
 19 |     """Configuration for LLM-based extraction"""
 20 |     model_name: str = 'jinaai/ReaderLM-v2'
 21 |     max_model_len: int = 512_000
 22 |     temperature: float = 0.0
 23 |     top_k: int = 1
 24 |     presence_penalty: float = 0.25
 25 |     frequency_penalty: float = 0.25
 26 |     repetition_penalty: float = 1.13
 27 |     max_tokens: int = 16_384
 28 | 
 29 | # DEFAULT_SCHEMA = """
 30 | # {
 31 | #   "type": "object",
 32 | #   "properties": {
 33 | #     "title": {
 34 | #       "type": "string"
 35 | #     },
 36 | #     "author": {
 37 | #       "type": "string"
 38 | #     },
 39 | #     "date": {
 40 | #       "type": "string"
 41 | #     },
 42 | #     "content": {
 43 | #       "type": "string"
 44 | #     }
 45 | #   },
 46 | #   "required": ["title", "author", "date", "content"]
 47 | # }
 48 | # """
 49 | 
 50 | class FastWebScraper:
 51 |     """Enhanced scraper with LLM-powered extraction and multiple strategies"""
 52 |     def __init__(
 53 |         self,
 54 |         llm_config: Optional[LLMConfig] = None,
 55 |         browser_config: Optional[BrowserConfig] = None,
 56 |         json_schema: Optional[Dict[str, Any]] = None,
 57 |         debug: bool = False
 58 |     ):
 59 |         self.debug = debug
 60 |         self.browser_config = browser_config or BrowserConfig(headless=True, verbose=debug)
 61 |         self.llm_config = llm_config or LLMConfig()
 62 |         self.json_schema = None #json_schema or json.loads(DEFAULT_SCHEMA)
 63 |         
 64 |         # Initialize LLM
 65 |         self.sampling_params = SamplingParams(
 66 |             temperature=self.llm_config.temperature,
 67 |             top_k=self.llm_config.top_k,
 68 |             presence_penalty=self.llm_config.presence_penalty,
 69 |             repetition_penalty=self.llm_config.repetition_penalty,
 70 |             max_tokens=self.llm_config.max_tokens,
 71 |             frequency_penalty=self.llm_config.frequency_penalty
 72 |         )
 73 |         
 74 |         self.llm = LLM(
 75 |             model=self.llm_config.model_name,
 76 |             max_model_len=self.llm_config.max_model_len,
 77 |             dtype='float16'
 78 |         )
 79 |         
 80 |         self.tokenizer = self.llm.get_tokenizer()
 81 | 
 82 |     def _create_prompt(self, text: str, instruction: Optional[str] = None) -> str:
 83 |         """Create a prompt for the LLM"""
 84 |         if not instruction:
 85 |             instruction = "Extract the main content and convert to structured format."
 86 |         
 87 |         if self.json_schema:
 88 |             instruction = "Extract information according to the schema and return JSON."
 89 |             prompt = f"{instruction}\n```html\n{text}\n```\nSchema:```json\n{json.dumps(self.json_schema, indent=2)}\n```"
 90 |         else:
 91 |             prompt = f"{instruction}\n```html\n{text}\n```"
 92 | 
 93 |         messages = [{"role": "user", "content": prompt}]
 94 |         return self.tokenizer.apply_chat_template(
 95 |             messages, tokenize=False, add_generation_prompt=True
 96 |         )
 97 | 
 98 |     async def _extract_content(self, html: str, instruction: Optional[str] = None) -> str:
 99 |         """Extract content using LLM"""
100 |         cleaned_html = clean_html(html, clean_svg=True, clean_base64=True)
101 |         prompt = self._create_prompt(cleaned_html, instruction)
102 |         
103 |         outputs = self.llm.generate(prompt, self.sampling_params)
104 |         raw_text = outputs[0].outputs[0].text
105 |         return self._parse_llm_output(raw_text)
106 | 
107 |     def _parse_llm_output(self, text: str) -> str:
108 |         """
109 |         Parse LLM output, handling both single dictionaries and lists of dictionaries.
110 |         Returns the content field from the most appropriate dictionary.
111 |         """
112 |         try:
113 |             # Strip any markdown code block markers
114 |             text = text.strip()
115 |             if text.startswith('```') and text.endswith('```'):
116 |                 text = text.split('```')[1]
117 |                 if text.startswith('json'):
118 |                     text = text[4:]
119 |             
120 |             data = json.loads(text.strip())
121 |             
122 |             if isinstance(data, dict):
123 |                 return data.get('content', '')
124 |             
125 |             if isinstance(data, list):
126 |                 # First try to find a dictionary with non-empty content
127 |                 for item in data:
128 |                     if isinstance(item, dict) and item.get('content'):
129 |                         return item['content']
130 |                 
131 |                 # If no content found, return content from last item or empty string
132 |                 last_item = data[-1]
133 |                 return last_item.get('content', '') if isinstance(last_item, dict) else ''
134 |             
135 |             return ''
136 |             
137 |         except json.JSONDecodeError:
138 |             # If JSON parsing fails, return the original text
139 |             return text.strip()
140 |         except Exception:
141 |             return ''
142 | 
143 |     async def scrape(self, url: str, instruction: Optional[str] = None) -> ExtractionResult:
144 |         """
145 |         Scrape and process content from a URL
146 |         
147 |         Args:
148 |             url: Target URL to scrape
149 |             instruction: Optional custom instruction for the LLM
150 |         """
151 |         try:
152 |             if self.debug:
153 |                 print(f"Debug: Processing URL: {url}")
154 | 
155 |             # Handle Wikipedia URLs
156 |             if 'wikipedia.org/wiki/' in url:
157 |                 try:
158 |                     content = get_wikipedia_content(url)
159 |                     return ExtractionResult(
160 |                         name="llm_extraction",
161 |                         success=True,
162 |                         content=content
163 |                     )
164 |                 except Exception as e:
165 |                     if self.debug:
166 |                         print(f"Debug: Wikipedia extraction failed: {str(e)}")
167 |                     # If Wikipedia extraction fails, fall through to normal scraping
168 | 
169 |             # Fetch HTML
170 |             async with AsyncWebCrawler(config=self.browser_config) as crawler:
171 |                 result = await crawler.arun(url=url, config=CrawlerRunConfig())
172 |                 
173 |             if not result.success:
174 |                 return ExtractionResult(
175 |                     name="llm_extraction",
176 |                     success=False,
177 |                     error="Failed to fetch HTML"
178 |                 )
179 | 
180 |             # Process with LLM
181 |             content = await self._extract_content(result.html, instruction)
182 |             
183 |             return ExtractionResult(
184 |                 name="llm_extraction",
185 |                 success=True,
186 |                 content=content
187 |             )
188 | 
189 |         except Exception as e:
190 |             if self.debug:
191 |                 import traceback
192 |                 print(f"Debug: Exception during scraping:")
193 |                 print(traceback.format_exc())
194 |             
195 |             return ExtractionResult(
196 |                 name="llm_extraction",
197 |                 success=False,
198 |                 error=str(e)
199 |             )
200 | 
201 |     async def scrape_many(self, urls: List[str], instruction: Optional[str] = None) -> Dict[str, ExtractionResult]:
202 |         """
203 |         Scrape multiple URLs
204 |         
205 |         Args:
206 |             urls: List of target URLs
207 |             instruction: Optional custom instruction for the LLM
208 |         """
209 |         results = {}
210 |         for url in urls:
211 |             results[url] = await self.scrape(url, instruction)
212 |         return results
213 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/strategy_factory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Contains the StrategyFactory class for creating various extraction strategies.
 3 | """
 4 | 
 5 | import os
 6 | from typing import Optional
 7 | 
 8 | from crawl4ai.extraction_strategy import (
 9 |     LLMExtractionStrategy,
10 |     JsonCssExtractionStrategy,
11 |     JsonXPathExtractionStrategy,
12 |     NoExtractionStrategy,
13 |     CosineStrategy,
14 | )
15 | 
16 | class StrategyFactory:
17 |     """Factory for creating extraction strategies"""
18 |     @staticmethod
19 |     def create_llm_strategy(
20 |         input_format: str = "markdown",
21 |         instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content",
22 |     ) -> LLMExtractionStrategy:
23 |         return LLMExtractionStrategy(
24 |             input_format=input_format,
25 |             provider="openrouter/google/gemini-2.0-flash-lite-001",  # Uses LiteLLM as provider
26 |             api_token=os.getenv("OPENROUTER_API_KEY"),
27 |             instruction=instruction
28 |         )
29 | 
30 |     @staticmethod
31 |     def create_css_strategy() -> JsonCssExtractionStrategy:
32 |         schema = {
33 |             "baseSelector": ".product",
34 |             "fields": [
35 |                 {"name": "title", "selector": "h1.product-title", "type": "text"},
36 |                 {"name": "price", "selector": ".price", "type": "text"},
37 |                 {"name": "description", "selector": ".description", "type": "text"},
38 |             ],
39 |         }
40 |         return JsonCssExtractionStrategy(schema=schema)
41 | 
42 |     @staticmethod
43 |     def create_xpath_strategy() -> JsonXPathExtractionStrategy:
44 |         schema = {
45 |             "baseSelector": "//div[@class='product']",
46 |             "fields": [
47 |                 {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
48 |                 {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
49 |                 {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"},
50 |             ],
51 |         }
52 |         return JsonXPathExtractionStrategy(schema=schema)
53 | 
54 |     @staticmethod
55 |     def create_no_extraction_strategy() -> NoExtractionStrategy:
56 |         return NoExtractionStrategy()
57 | 
58 |     @staticmethod
59 |     def create_cosine_strategy(
60 |         semantic_filter: Optional[str] = None,
61 |         word_count_threshold: int = 10,
62 |         max_dist: float = 0.2,
63 |         sim_threshold: float = 0.3,
64 |         debug: bool = False
65 |     ) -> CosineStrategy:
66 |         return CosineStrategy(
67 |             semantic_filter=semantic_filter,
68 |             word_count_threshold=word_count_threshold,
69 |             max_dist=max_dist,
70 |             sim_threshold=sim_threshold,
71 |             verbose=debug
72 |         ) 


--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Tuple
  3 | import fasttext
  4 | from huggingface_hub import hf_hub_download
  5 | import wikipediaapi
  6 | 
  7 | # Load the model
  8 | model = fasttext.load_model(hf_hub_download("kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2", "model.bin"))
  9 | 
 10 | def clean_markdown_links(text: str, min_quality_score: float = 0.2) -> Tuple[str, float]:
 11 |     """
 12 |     Clean markdown links and filter low-quality content.
 13 |     Returns tuple of (cleaned_text, quality_score)
 14 |     """
 15 |     # Split by double newlines to preserve paragraph structure
 16 |     paragraphs = text.split('\n\n')
 17 |     
 18 |     cleaned_paragraphs = []
 19 |     for paragraph in paragraphs:
 20 |         # Preserve code blocks by checking if paragraph contains ``` tags
 21 |         if '```' in paragraph:
 22 |             cleaned_paragraphs.append(paragraph)
 23 |             continue
 24 |             
 25 |         lines = paragraph.split('\n')
 26 |         filtered_lines = []
 27 |         for line in lines:
 28 |             line = line.strip()
 29 |             # Keep headers regardless of length
 30 |             if re.match(r'^#{1,6}\s+', line):
 31 |                 filtered_lines.append(line)
 32 |                 continue
 33 |             
 34 |             # Skip common UI/navigation elements
 35 |             if re.match(r'^(Share|Trade|More|Buy|Sell|Download|Menu|Home|Back|Next|Previous|\d+\s*(BTC|USD|EUR|GBP)|\w{3}-\w{1,3}|Currency:.*|You (Buy|Spend|Receive)|≈|\d+\.\d+)', line, re.IGNORECASE):
 36 |                 continue
 37 |                 
 38 |             # Count words before removing markdown
 39 |             word_count = len(re.sub(r'\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>', '', line).split())
 40 |             
 41 |             # Increase minimum word threshold to 12
 42 |             if word_count < 12:
 43 |                 # Check if line only contains markdown patterns or appears to be a currency/trading related line
 44 |                 cleaned_line = re.sub(r'\[!\[.*?\]\(.*?\)\]\(.*?\)|\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>|\d+(\.\d+)?%?|\$\d+(\.\d+)?', '', line).strip()
 45 |                 if not cleaned_line or len(cleaned_line.split()) < 8:  # If nothing substantial remains, skip this line
 46 |                     continue
 47 |             
 48 |             filtered_lines.append(line)
 49 |         
 50 |         # Only add paragraph if it has any lines left
 51 |         if filtered_lines:
 52 |             cleaned_paragraphs.append('\n'.join(filtered_lines))
 53 |     
 54 |     # Rejoin with double newlines
 55 |     cleaned_text = '\n\n'.join(cleaned_paragraphs)
 56 |     
 57 |     # Get quality score
 58 |     quality_score = predict_educational_value([cleaned_text])[0]
 59 |     
 60 |     return cleaned_text, quality_score
 61 | 
 62 | def filter_quality_content(text: str, min_quality_score: float = 0.2) -> str:
 63 |     """
 64 |     Filter content based on quality and returns concatenated quality content
 65 |     """
 66 |     # Split text into paragraphs
 67 |     paragraphs = text.split('\n\n')
 68 |     
 69 |     # Process each paragraph
 70 |     quality_content = []
 71 |     for paragraph in paragraphs:
 72 |         if not paragraph.strip():  # Skip empty paragraphs
 73 |             continue
 74 |             
 75 |         cleaned_text, quality_score = clean_markdown_links(paragraph, min_quality_score)
 76 |         if cleaned_text and quality_score >= min_quality_score:
 77 |             quality_content.append((cleaned_text, quality_score))
 78 |     
 79 |     # Debug print
 80 |     print(f"Found {len(quality_content)} quality paragraphs out of {len(paragraphs)} total")
 81 |     
 82 |     if quality_content:
 83 |         return "\n\n".join(text for text, _ in quality_content)
 84 |     return text  # Return original text if no quality content found
 85 | 
 86 | def replace_newlines(text: str) -> str:
 87 |     """Replace multiple newlines with a single space."""
 88 |     return re.sub("\n+", " ", text)
 89 | 
 90 | score_dict = {
 91 |     '__label__': 0, 
 92 |     '__label__Low': 0, 
 93 |     '__label__Mid': 1,
 94 |     '__label__High': 2
 95 | }
 96 | 
 97 | def predict_educational_value(text_list: List[str]) -> List[float]:
 98 |     """
 99 |     Predict educational value scores for a list of texts.
100 |     Returns a list of scores between 0 and 2.
101 |     """
102 |     text_list = [replace_newlines(text) for text in text_list]
103 |     pred = model.predict(text_list, k=-1)
104 |     score_list = []
105 |     for l, s in zip(*pred):
106 |         score = 0
107 |         for _l, _s in zip(l, s):
108 |             score += score_dict[_l] * _s
109 |         score_list.append(float(score))
110 |     return score_list
111 | 
112 | def get_wikipedia_content(url: str) -> str | None:
113 |     """
114 |     Extract content from a Wikipedia URL.
115 |     
116 |     Args:
117 |         url: Wikipedia URL to scrape
118 |         
119 |     Returns:
120 |         str: Page content if found, None otherwise
121 |     """
122 |     wiki = wikipediaapi.Wikipedia(user_agent="opendeepsearch", language='en')
123 |     
124 |     # Extract the page title from URL (everything after /wiki/)
125 |     try:
126 |         title = url.split('/wiki/')[-1]
127 |         page = wiki.page(title)
128 |         if page.exists():
129 |             return page.text
130 |         return None
131 |     except Exception:
132 |         return None
133 | 
134 | # Patterns
135 | SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
136 | STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
137 | META_PATTERN = r"<[ ]*meta.*?>"
138 | COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
139 | LINK_PATTERN = r"<[ ]*link.*?>"
140 | BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
141 | SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
142 | IFRAME_PATTERN = r"<[ ]*iframe.*?\/[ ]*iframe[ ]*>"
143 | NOSCRIPT_PATTERN = r"<[ ]*noscript.*?\/[ ]*noscript[ ]*>"
144 | HEADER_PATTERN = r"<[ ]*header.*?\/[ ]*header[ ]*>"
145 | FOOTER_PATTERN = r"<[ ]*footer.*?\/[ ]*footer[ ]*>"
146 | NAV_PATTERN = r"<[ ]*nav.*?\/[ ]*nav[ ]*>"
147 | FORM_PATTERN = r"<[ ]*form.*?\/[ ]*form[ ]*>"
148 | 
149 | 
150 | def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
151 |     return re.sub(
152 |         SVG_PATTERN,
153 |         lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
154 |         html,
155 |         flags=re.DOTALL,
156 |     )
157 | 
158 | 
159 | def replace_base64_images(html: str, new_image_src: str = "#") -> str:
160 |     return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)
161 | 
162 | 
163 | def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
164 |     """Clean HTML content by removing various elements."""
165 |     patterns = [
166 |         SCRIPT_PATTERN,
167 |         STYLE_PATTERN,
168 |         META_PATTERN,
169 |         COMMENT_PATTERN,
170 |         LINK_PATTERN,
171 |         IFRAME_PATTERN,
172 |         NOSCRIPT_PATTERN,
173 |         HEADER_PATTERN,
174 |         FOOTER_PATTERN,
175 |         NAV_PATTERN,
176 |         FORM_PATTERN
177 |     ]
178 |     
179 |     for pattern in patterns:
180 |         html = re.sub(pattern, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
181 | 
182 |     if clean_svg:
183 |         html = replace_svg(html)
184 |     if clean_base64:
185 |         html = replace_base64_images(html)
186 |         
187 |     # Remove empty lines and excessive whitespace
188 |     html = re.sub(r'\n\s*\n', '\n', html)
189 |     html = re.sub(r'\s+', ' ', html)
190 |     
191 |     return html.strip()
192 | 
193 | JSON_SCHEMA = """
194 | {
195 |   "type": "object",
196 |   "properties": {
197 |     "title": {
198 |       "type": "string"
199 |     },
200 |     "author": {
201 |       "type": "string"
202 |     },
203 |     "date": {
204 |       "type": "string"
205 |     },
206 |     "content": {
207 |       "type": "string"
208 |     }
209 |   },
210 |   "required": ["title", "author", "date", "content"]
211 | }
212 | """


--------------------------------------------------------------------------------
/src/opendeepsearch/ods_agent.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, Dict, Any, Literal
  2 | from opendeepsearch.serp_search.serp_search import create_search_api, SearchAPI
  3 | from opendeepsearch.context_building.process_sources_pro import SourceProcessor
  4 | from opendeepsearch.context_building.build_context import build_context
  5 | from litellm import completion, utils
  6 | from dotenv import load_dotenv
  7 | import os
  8 | from opendeepsearch.prompts import SEARCH_SYSTEM_PROMPT
  9 | import asyncio
 10 | import nest_asyncio
 11 | load_dotenv()
 12 | 
 13 | class OpenDeepSearchAgent:
 14 |     def __init__(
 15 |         self,
 16 |         model: Optional[str] = None, #We use LiteLLM to call the model
 17 |         system_prompt: Optional[str] = SEARCH_SYSTEM_PROMPT,
 18 |         search_provider: Literal["serper", "searxng"] = "serper",
 19 |         serper_api_key: Optional[str] = None,
 20 |         searxng_instance_url: Optional[str] = None,
 21 |         searxng_api_key: Optional[str] = None,
 22 |         source_processor_config: Optional[Dict[str, Any]] = None,
 23 |         temperature: float = 0.2, # Slight variation while maintaining reliability
 24 |         top_p: float = 0.3, # Focus on high-confidence tokens
 25 |         reranker: Optional[str] = "None", # Optional reranker identifier
 26 |     ):
 27 |         """
 28 |         Initialize an OpenDeepSearch agent that combines web search, content processing, and LLM capabilities.
 29 | 
 30 |         This agent performs web searches using either SerperAPI or SearXNG, processes the search results to extract
 31 |         relevant information, and uses a language model to generate responses based on the gathered context.
 32 | 
 33 |         Args:
 34 |             model (str): The identifier for the language model to use (compatible with LiteLLM).
 35 |             system_prompt (str, optional): Custom system prompt for the language model. If not provided,
 36 |                 uses a default prompt that instructs the model to answer based on context.
 37 |             search_provider (str, optional): The search provider to use ('serper' or 'searxng'). Default is 'serper'.
 38 |             serper_api_key (str, optional): API key for SerperAPI. Required if search_provider is 'serper' and
 39 |                 SERPER_API_KEY environment variable is not set.
 40 |             searxng_instance_url (str, optional): URL of the SearXNG instance. Required if search_provider is 'searxng'
 41 |                 and SEARXNG_INSTANCE_URL environment variable is not set.
 42 |             searxng_api_key (str, optional): API key for SearXNG instance. Optional even if search_provider is 'searxng'.
 43 |             source_processor_config (Dict[str, Any], optional): Configuration dictionary for the
 44 |                 SourceProcessor. Supports the following options:
 45 |                 - strategies (List[str]): Content extraction strategies to use
 46 |                 - filter_content (bool): Whether to enable content filtering
 47 |                 - top_results (int): Number of top results to process
 48 |             temperature (float, default=0.2): Controls randomness in model outputs. Lower values make
 49 |                 the output more focused and deterministic.
 50 |             top_p (float, default=0.3): Controls nucleus sampling for model outputs. Lower values make
 51 |                 the output more focused on high-probability tokens.
 52 |             reranker (str, optional): Identifier for the reranker to use. If not provided,
 53 |                 uses the default reranker from SourceProcessor.
 54 |         """
 55 |         # Initialize search API based on provider
 56 |         self.serp_search = create_search_api(
 57 |             search_provider=search_provider,
 58 |             serper_api_key=serper_api_key,
 59 |             searxng_instance_url=searxng_instance_url,
 60 |             searxng_api_key=searxng_api_key
 61 |         )
 62 | 
 63 |         # Update source_processor_config with reranker if provided
 64 |         if source_processor_config is None:
 65 |             source_processor_config = {}
 66 |         if reranker:
 67 |             source_processor_config['reranker'] = reranker
 68 | 
 69 |         # Initialize SourceProcessor with provided config or defaults
 70 |         self.source_processor = SourceProcessor(**source_processor_config)
 71 | 
 72 |         # Initialize LLM settings
 73 |         self.model = model if model is not None else os.getenv("LITELLM_SEARCH_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001"))
 74 |         self.temperature = temperature
 75 |         self.top_p = top_p
 76 |         self.system_prompt = system_prompt
 77 | 
 78 |         # Configure LiteLLM with OpenAI base URL if provided
 79 |         openai_base_url = os.environ.get("OPENAI_BASE_URL")
 80 |         if openai_base_url:
 81 |             utils.set_provider_config("openai", {"base_url": openai_base_url})
 82 | 
 83 |     async def search_and_build_context(
 84 |         self,
 85 |         query: str,
 86 |         max_sources: int = 2,
 87 |         pro_mode: bool = False
 88 |     ) -> str:
 89 |         """
 90 |         Performs a web search and builds a context from the search results.
 91 | 
 92 |         This method executes a search query, processes the returned sources, and builds a
 93 |         consolidated context, inspired by FreshPrompt in the FreshLLMs paper, that can be used for answering questions.
 94 | 
 95 |         Args:
 96 |             query (str): The search query to execute.
 97 |             max_sources (int, default=2): Maximum number of sources to process. If pro_mode
 98 |                 is enabled, this overrides the top_results setting in source_processor_config
 99 |                 when it's smaller.
100 |             pro_mode (bool, default=False): When enabled, performs a deeper search and more
101 |                 thorough content processing.
102 | 
103 |         Returns:
104 |             str: A formatted context string built from the processed search results.
105 |         """
106 |         # Get sources from SERP
107 |         sources = self.serp_search.get_sources(query)
108 | 
109 |         # Process sources
110 |         processed_sources = await self.source_processor.process_sources(
111 |             sources,
112 |             max_sources,
113 |             query,
114 |             pro_mode
115 |         )
116 | 
117 |         # Build and return context
118 |         return build_context(processed_sources)
119 | 
120 |     async def ask(
121 |         self,
122 |         query: str,
123 |         max_sources: int = 2,
124 |         pro_mode: bool = False,
125 |     ) -> str:
126 |         """
127 |         Searches for information and generates an AI response to the query.
128 | 
129 |         This method combines web search, context building, and AI completion to provide
130 |         informed answers to questions. It first gathers relevant information through search,
131 |         then uses an LLM to generate a response based on the collected context.
132 | 
133 |         Args:
134 |             query (str): The question or query to answer.
135 |             max_sources (int, default=2): Maximum number of sources to include in the context.
136 |             pro_mode (bool, default=False): When enabled, performs a more comprehensive search
137 |                 and analysis of sources.
138 | 
139 |         Returns:
140 |             str: An AI-generated response that answers the query based on the gathered context.
141 |         """
142 |         # Get context from search results
143 |         context = await self.search_and_build_context(query, max_sources, pro_mode)
144 |         # Prepare messages for the LLM
145 |         messages = [
146 |             {"role": "system", "content": self.system_prompt},
147 |             {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
148 |         ]
149 |         # Get completion from LLM
150 |         response = completion(
151 |             model=self.model,
152 |             messages=messages,
153 |             temperature=self.temperature,
154 |             top_p=self.top_p
155 |         )
156 | 
157 |         return response.choices[0].message.content
158 | 
159 |     def ask_sync(
160 |         self,
161 |         query: str,
162 |         max_sources: int = 2,
163 |         pro_mode: bool = False,
164 |     ) -> str:
165 |         """
166 |         Synchronous version of ask() method.
167 |         """
168 |         try:
169 |             # Try getting the current event loop
170 |             loop = asyncio.get_event_loop()
171 |             if loop.is_running():
172 |                 # If we're in a running event loop (e.g., Jupyter), use nest_asyncio
173 |                 nest_asyncio.apply()
174 |         except RuntimeError:
175 |             # If there's no event loop, create a new one
176 |             loop = asyncio.new_event_loop()
177 |             asyncio.set_event_loop(loop)
178 | 
179 |         return loop.run_until_complete(self.ask(query, max_sources, pro_mode))
180 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/ods_tool.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Literal
 2 | from smolagents import Tool
 3 | from opendeepsearch.ods_agent import OpenDeepSearchAgent
 4 | 
 5 | class OpenDeepSearchTool(Tool):
 6 |     name = "web_search"
 7 |     description = """
 8 |     Performs web search based on your query (think a Google search) then returns the final answer that is processed by an llm."""
 9 |     inputs = {
10 |         "query": {
11 |             "type": "string",
12 |             "description": "The search query to perform",
13 |         },
14 |     }
15 |     output_type = "string"
16 | 
17 |     def __init__(
18 |         self,
19 |         model_name: Optional[str] = None,
20 |         reranker: str = "infinity",
21 |         search_provider: Literal["serper", "searxng"] = "serper",
22 |         serper_api_key: Optional[str] = None,
23 |         searxng_instance_url: Optional[str] = None,
24 |         searxng_api_key: Optional[str] = None
25 |     ):
26 |         super().__init__()
27 |         self.search_model_name = model_name  # LiteLLM model name
28 |         self.reranker = reranker
29 |         self.search_provider = search_provider
30 |         self.serper_api_key = serper_api_key
31 |         self.searxng_instance_url = searxng_instance_url
32 |         self.searxng_api_key = searxng_api_key
33 | 
34 |     def forward(self, query: str):
35 |         answer = self.search_tool.ask_sync(query, max_sources=2, pro_mode=True)
36 |         return answer
37 | 
38 |     def setup(self):
39 |         self.search_tool = OpenDeepSearchAgent(
40 |             self.search_model_name,
41 |             reranker=self.reranker,
42 |             search_provider=self.search_provider,
43 |             serper_api_key=self.serper_api_key,
44 |             searxng_instance_url=self.searxng_instance_url,
45 |             searxng_api_key=self.searxng_api_key
46 |         )
47 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/prompts.py:
--------------------------------------------------------------------------------
  1 | from smolagents import PromptTemplates
  2 | 
  3 | SEARCH_SYSTEM_PROMPT = """
  4 | You are an AI-powered search agent that takes in a user’s search query, retrieves relevant search results, and provides an accurate and concise answer based on the provided context.
  5 | 
  6 | ## **Guidelines**
  7 | 
  8 | ### 1. **Prioritize Reliable Sources**
  9 | - Use **ANSWER BOX** when available, as it is the most likely authoritative source.
 10 | - Prefer **Wikipedia** if present in the search results for general knowledge queries.
 11 | - If there is a conflict between **Wikipedia** and the **ANSWER BOX**, rely on **Wikipedia**.
 12 | - Prioritize **government (.gov), educational (.edu), reputable organizations (.org), and major news outlets** over less authoritative sources.
 13 | - When multiple sources provide conflicting information, prioritize the most **credible, recent, and consistent** source.
 14 | 
 15 | ### 2. **Extract the Most Relevant Information**
 16 | - Focus on **directly answering the query** using the information from the **ANSWER BOX** or **SEARCH RESULTS**.
 17 | - Use **additional information** only if it provides **directly relevant** details that clarify or expand on the query.
 18 | - Ignore promotional, speculative, or repetitive content.
 19 | 
 20 | ### 3. **Provide a Clear and Concise Answer**
 21 | - Keep responses **brief (1–3 sentences)** while ensuring accuracy and completeness.
 22 | - If the query involves **numerical data** (e.g., prices, statistics), return the **most recent and precise value** available.
 23 | - If the source is available, then mention it in the answer to the question. If you're relying on the answer box, then do not mention the source if it's not there.
 24 | - For **diverse or expansive queries** (e.g., explanations, lists, or opinions), provide a more detailed response when the context justifies it.
 25 | 
 26 | ### 4. **Handle Uncertainty and Ambiguity**
 27 | - If **conflicting answers** are present, acknowledge the discrepancy and mention the different perspectives if relevant.
 28 | - If **no relevant information** is found in the context, explicitly state that the query could not be answered.
 29 | 
 30 | ### 5. **Answer Validation**
 31 | - Only return answers that can be **directly validated** from the provided context.
 32 | - Do not generate speculative or outside knowledge answers. If the context does not contain the necessary information, state that the answer could not be found.
 33 | 
 34 | ### 6. **Bias and Neutrality**
 35 | - Maintain **neutral language** and avoid subjective opinions.
 36 | - For controversial topics, present multiple perspectives if they are available and relevant.
 37 | """
 38 | 
 39 | REACT_PROMPT = PromptTemplates(system_prompt="""
 40 | You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can.
 41 | To do so, you have been given access to some tools.
 42 | 
 43 | The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation".
 44 | This Action/Observation can repeat N times, you should take several steps when needed.
 45 | 
 46 | You can use the result of the previous action as input for the next action.
 47 | The observation will always be a string: it can represent a file, like "image_1.jpg".
 48 | Then you can use it as input for the next action. You can do it for instance as follows:
 49 | 
 50 | Observation: "image_1.jpg"
 51 | 
 52 | Action:
 53 | {
 54 |   "name": "image_transformer",
 55 |   "arguments": {"image": "image_1.jpg"}
 56 | }
 57 | 
 58 | To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this:
 59 | Action:
 60 | {
 61 |   "name": "final_answer",
 62 |   "arguments": {"answer": "insert your final answer here"}
 63 | }
 64 | 
 65 | 
 66 | Here are a few examples using notional tools:
 67 | ---
 68 | Task: "What historical event happened closest in time to the invention of the telephone: the American Civil War or the establishment of the Eiffel Tower?"
 69 | 
 70 | Action:
 71 | {
 72 |   "name": "web_search",
 73 |   "arguments": {"query": "year of telephone invention"}
 74 | }
 75 | Observation: "The telephone was invented in 1876."
 76 | 
 77 | Action:
 78 | {
 79 |   "name": "web_search",
 80 |   "arguments": {"query": "year American Civil War ended"}
 81 | }
 82 | Observation: "The American Civil War ended in 1865."
 83 | 
 84 | Action:
 85 | {
 86 |   "name": "web_search",
 87 |   "arguments": {"query": "year Eiffel Tower established"}
 88 | }
 89 | Observation: "The Eiffel Tower was completed in 1889."
 90 | 
 91 | Action:
 92 | {
 93 |   "name": "calculate",
 94 |   "arguments": {"expression": "|1876 - 1865| and |1889 - 1876|"}
 95 | }
 96 | Observation: "11 years (Civil War) and 13 years (Eiffel Tower)."
 97 | 
 98 | Action:
 99 | {
100 |   "name": "final_answer",
101 |   "arguments": {"answer": "The historical event closest in time to the invention of the telephone is the end of the American Civil War (11 years apart)."}
102 | }
103 | 
104 | ---
105 | Task: "Which country has a higher population density: Japan or India?"
106 | 
107 | Action:
108 | {
109 |   "name": "web_search",
110 |   "arguments": {"query": "population and area of Japan"}
111 | }
112 | Observation: "Japan has a population of 125 million and an area of 377,975 square kilometers."
113 | 
114 | Action:
115 | {
116 |   "name": "web_search",
117 |   "arguments": {"query": "population and area of India"}
118 | }
119 | Observation: "India has a population of 1.38 billion and an area of 3,287,263 square kilometers."
120 | 
121 | Action:
122 | {
123 |   "name": "calculate",
124 |   "arguments": {"expression": "125 million / 377,975 and 1.38 billion / 3,287,263"}
125 | }
126 | Observation: "Japan: 330.7 people/km²; India: 419.6 people/km²."
127 | 
128 | Action:
129 | {
130 |   "name": "final_answer",
131 |   "arguments": {"answer": "India has a higher population density (419.6 people/km²) than Japan (330.7 people/km²)."}
132 | }
133 | 
134 | ---
135 | Task: "Which country has won more total Olympic gold medals: the United States or China?"
136 | 
137 | Action:
138 | {
139 |   "name": "web_search",
140 |   "arguments": {"query": "total Olympic gold medals won by the United States"}
141 | }
142 | Observation: "The United States has won 1,127 gold medals."
143 | 
144 | Action:
145 | {
146 |   "name": "web_search",
147 |   "arguments": {"query": "total Olympic gold medals won by China"}
148 | }
149 | Observation: "China has won 283 gold medals."
150 | 
151 | Action:
152 | {
153 |   "name": "calculate",
154 |   "arguments": {"expression": "1,127 - 283"}
155 | }
156 | Observation: "The United States has 844 more gold medals than China."
157 | 
158 | Action:
159 | {
160 |   "name": "final_answer",
161 |   "arguments": {"answer": "The United States has won more Olympic gold medals (1,127) than China (283)."}
162 | }
163 | 
164 | ---
165 | Task: "Who discovered the structure of DNA, and in which year was the discovery made?"
166 | 
167 | Action:
168 | {
169 |   "name": "web_search",
170 |   "arguments": {"query": "scientists who discovered DNA structure"}
171 | }
172 | Observation: "James Watson and Francis Crick discovered the structure of DNA."
173 | 
174 | Action:
175 | {
176 |   "name": "web_search",
177 |   "arguments": {"query": "year DNA structure discovered"}
178 | }
179 | Observation: "The structure of DNA was discovered in 1953."
180 | 
181 | Action:
182 | {
183 |   "name": "final_answer",
184 |   "arguments": {"answer": "James Watson and Francis Crick discovered the structure of DNA in 1953."}
185 | }
186 | 
187 | ---
188 | Task: "How many meters taller is the Burj Khalifa compared to the Empire State Building?"
189 | 
190 | Action:
191 | {
192 |   "name": "web_search",
193 |   "arguments": {"query": "height of Burj Khalifa"}
194 | }
195 | Observation: "The Burj Khalifa is 828 meters tall."
196 | 
197 | Action:
198 | {
199 |   "name": "web_search",
200 |   "arguments": {"query": "height of Empire State Building"}
201 | }
202 | Observation: "The Empire State Building is 381 meters tall."
203 | 
204 | Action:
205 | {
206 |   "name": "calculate",
207 |   "arguments": {"expression": "828 - 381"}
208 | }
209 | Observation: "The difference is 447 meters."
210 | 
211 | Action:
212 | {
213 |   "name": "final_answer",
214 |   "arguments": {"answer": "The Burj Khalifa is 447 meters taller than the Empire State Building."}
215 | }
216 | 
217 | ---
218 | Task: "Which country launched the first satellite into space, and what was the name of the satellite?"
219 | 
220 | Action:
221 | {
222 |   "name": "web_search",
223 |   "arguments": {"query": "first satellite launched into space"}
224 | }
225 | Observation: "The Soviet Union launched the first satellite."
226 | 
227 | Action:
228 | {
229 |   "name": "web_search",
230 |   "arguments": {"query": "name of first satellite in space"}
231 | }
232 | Observation: "The first satellite was Sputnik 1."
233 | 
234 | Action:
235 | {
236 |   "name": "final_answer",
237 |   "arguments": {"answer": "The Soviet Union launched the first satellite into space, named Sputnik 1."}
238 | }
239 | 
240 | ---
241 | Task: "Which novel by George Orwell introduced the concept of 'Big Brother,' and in what year was it published?"
242 | 
243 | Action:
244 | {
245 |   "name": "web_search",
246 |   "arguments": {"query": "novel by George Orwell Big Brother"}
247 | }
248 | Observation: "The novel is '1984.'"
249 | 
250 | Action:
251 | {
252 |   "name": "web_search",
253 |   "arguments": {"query": "year '1984' by George Orwell published"}
254 | }
255 | Observation: "'1984' was published in 1949."
256 | 
257 | Action:
258 | {
259 |   "name": "final_answer",
260 |   "arguments": {"answer": "George Orwell's novel '1984,' which introduced the concept of 'Big Brother,' was published in 1949."}
261 | }
262 | 
263 | ---
264 | Task: "Which country hosted the first FIFA World Cup, and in what year?"
265 | 
266 | Action:
267 | {
268 |   "name": "web_search",
269 |   "arguments": {"query": "country hosted first FIFA World Cup"}
270 | }
271 | Observation: "Uruguay hosted the first FIFA World Cup."
272 | 
273 | Action:
274 | {
275 |   "name": "web_search",
276 |   "arguments": {"query": "year of first FIFA World Cup"}
277 | }
278 | Observation: "The first FIFA World Cup was held in 1930."
279 | 
280 | Action:
281 | {
282 |   "name": "final_answer",
283 |   "arguments": {"answer": "Uruguay hosted the first FIFA World Cup in 1930."}
284 | }
285 | 
286 | ---
287 | Task: "Who invented the light bulb, and what company did he later establish?"
288 | 
289 | Action:
290 | {
291 |   "name": "web_search",
292 |   "arguments": {"query": "inventor of the light bulb"}
293 | }
294 | Observation: "Thomas Edison invented the light bulb."
295 | 
296 | Action:
297 | {
298 |   "name": "web_search",
299 |   "arguments": {"query": "company founded by Thomas Edison"}
300 | }
301 | Observation: "Thomas Edison founded General Electric."
302 | 
303 | Action:
304 | {
305 |   "name": "final_answer",
306 |   "arguments": {"answer": "Thomas Edison invented the light bulb and later established General Electric."}
307 | }
308 | 
309 | ---
310 | Task: "In which city was the Declaration of Independence signed, and in what building?"
311 | 
312 | Action:
313 | {
314 |   "name": "web_search",
315 |   "arguments": {"query": "city where Declaration of Independence was signed"}
316 | }
317 | Observation: "The Declaration of Independence was signed in Philadelphia."
318 | 
319 | Action:
320 | {
321 |   "name": "web_search",
322 |   "arguments": {"query": "building where Declaration of Independence was signed"}
323 | }
324 | Observation: "It was signed in Independence Hall."
325 | 
326 | Action:
327 | {
328 |   "name": "final_answer",
329 |   "arguments": {"answer": "The Declaration of Independence was signed in Philadelphia at Independence Hall."}
330 | }
331 | 
332 | ---
333 | Task: "Who developed the theory of general relativity, and in what year was it published?"
334 | 
335 | Action:
336 | {
337 |   "name": "web_search",
338 |   "arguments": {"query": "developer of general relativity"}
339 | }
340 | Observation: "Albert Einstein developed the theory of general relativity."
341 | 
342 | Action:
343 | {
344 |   "name": "web_search",
345 |   "arguments": {"query": "year general relativity published"}
346 | }
347 | Observation: "The theory of general relativity was published in 1915."
348 | 
349 | Action:
350 | {
351 |   "name": "final_answer",
352 |   "arguments": {"answer": "Albert Einstein developed the theory of general relativity, which was published in 1915."}
353 | }
354 | 
355 | ---
356 | Task: "Which Shakespeare play features the phrase 'To be, or not to be,' and who speaks this line?"
357 | 
358 | Action:
359 | {
360 |   "name": "web_search",
361 |   "arguments": {"query": "Shakespeare play To be, or not to be"}
362 | }
363 | Observation: "The play is 'Hamlet.'"
364 | 
365 | Action:
366 | {
367 |   "name": "web_search",
368 |   "arguments": {"query": "character who says To be, or not to be in Hamlet"}
369 | }
370 | Observation: "The line is spoken by Hamlet."
371 | 
372 | Action:
373 | {
374 |   "name": "final_answer",
375 |   "arguments": {"answer": "The phrase 'To be, or not to be' is from Shakespeare's 'Hamlet,' and it is spoken by the character Hamlet."}
376 | }
377 | 
378 | ---
379 | Task: "What is the tallest mountain in Africa, and how high is it?"
380 | 
381 | Action:
382 | {
383 |   "name": "web_search",
384 |   "arguments": {"query": "tallest mountain in Africa"}
385 | }
386 | Observation: "Mount Kilimanjaro is the tallest mountain in Africa."
387 | 
388 | Action:
389 | {
390 |   "name": "web_search",
391 |   "arguments": {"query": "height of Mount Kilimanjaro"}
392 | }
393 | Observation: "Mount Kilimanjaro is 5,895 meters tall."
394 | 
395 | Action:
396 | {
397 |   "name": "final_answer",
398 |   "arguments": {"answer": "Mount Kilimanjaro, the tallest mountain in Africa, is 5,895 meters high."}
399 | }
400 | 
401 | ---
402 | Task: "Who was the first President of the United States to serve two non-consecutive terms?"
403 | 
404 | Action:
405 | {
406 |   "name": "web_search",
407 |   "arguments": {"query": "President who served two non-consecutive terms"}
408 | }
409 | Observation: "Grover Cleveland was the first President to serve two non-consecutive terms."
410 | 
411 | Action:
412 | {
413 |   "name": "final_answer",
414 |   "arguments": {"answer": "Grover Cleveland was the first President of the United States to serve two non-consecutive terms."}
415 | }
416 | 
417 | ---
418 | Task: "What planet is the largest in our solar system, and what is its diameter?"
419 | 
420 | Action:
421 | {
422 |   "name": "web_search",
423 |   "arguments": {"query": "largest planet in solar system"}
424 | }
425 | Observation: "Jupiter is the largest planet in the solar system."
426 | 
427 | Action:
428 | {
429 |   "name": "web_search",
430 |   "arguments": {"query": "diameter of Jupiter"}
431 | }
432 | Observation: "Jupiter's diameter is approximately 139,820 kilometers."
433 | 
434 | Action:
435 | {
436 |   "name": "final_answer",
437 |   "arguments": {"answer": "Jupiter is the largest planet in the solar system, with a diameter of approximately 139,820 kilometers."}
438 | }
439 | 
440 | ---
441 | Task: "What was the first airplane to fly, and in what year did it achieve this feat?"
442 | 
443 | Action:
444 | {
445 |   "name": "web_search",
446 |   "arguments": {"query": "first airplane to fly"}
447 | }
448 | Observation: "The first airplane to fly was the Wright Flyer."
449 | 
450 | Action:
451 | {
452 |   "name": "web_search",
453 |   "arguments": {"query": "year Wright Flyer first flight"}
454 | }
455 | Observation: "The Wright Flyer flew for the first time in 1903."
456 | 
457 | Action:
458 | {
459 |   "name": "final_answer",
460 |   "arguments": {"answer": "The Wright Flyer was the first airplane to fly, achieving this feat in 1903."}
461 | }
462 | 
463 | ---
464 | Task: "Who painted the Mona Lisa, and where is it displayed?"
465 | 
466 | Action:
467 | {
468 |   "name": "web_search",
469 |   "arguments": {"query": "artist who painted Mona Lisa"}
470 | }
471 | Observation: "Leonardo da Vinci painted the Mona Lisa."
472 | 
473 | Action:
474 | {
475 |   "name": "web_search",
476 |   "arguments": {"query": "where is the Mona Lisa displayed"}
477 | }
478 | Observation: "The Mona Lisa is displayed in the Louvre Museum in Paris."
479 | 
480 | Action:
481 | {
482 |   "name": "final_answer",
483 |   "arguments": {"answer": "Leonardo da Vinci painted the Mona Lisa, which is displayed in the Louvre Museum in Paris."}
484 | }
485 | 
486 | ---
487 | Task: "Who has won the most Grand Slam tennis titles, and how many have they won?"
488 | 
489 | Action:
490 | {
491 |   "name": "web_search",
492 |   "arguments": {"query": "player with most Grand Slam tennis titles"}
493 | }
494 | Observation: "Novak Djokovic has won the most Grand Slam titles."
495 | 
496 | Action:
497 | {
498 |   "name": "web_search",
499 |   "arguments": {"query": "number of Grand Slam titles Novak Djokovic"}
500 | }
501 | Observation: "Novak Djokovic has won 24 Grand Slam titles."
502 | 
503 | Action:
504 | {
505 |   "name": "final_answer",
506 |   "arguments": {"answer": "Novak Djokovic has won the most Grand Slam tennis titles, with 24 titles."}
507 | }
508 | 
509 | ---
510 | Task: "Who was the longest-reigning monarch in British history, and how many years did they reign?"
511 | 
512 | Action:
513 | {
514 |   "name": "web_search",
515 |   "arguments": {"query": "longest reigning monarch in British history"}
516 | }
517 | Observation: "Queen Elizabeth II was the longest-reigning monarch in British history."
518 | 
519 | Action:
520 | {
521 |   "name": "web_search",
522 |   "arguments": {"query": "length of reign Queen Elizabeth II"}
523 | }
524 | Observation: "Queen Elizabeth II reigned for 70 years."
525 | 
526 | Action:
527 | {
528 |   "name": "final_answer",
529 |   "arguments": {"answer": "Queen Elizabeth II was the longest-reigning monarch in British history, with a reign of 70 years."}
530 | }
531 | 
532 | ---
533 | Task: "Which Shakespeare play contains the line \"All the world's a stage,\" and how many years ago was it first performed if today is 2024?"
534 | 
535 | Action:
536 | {
537 |   "name": "web_search",
538 |   "arguments": {"query": "Shakespeare play All the world's a stage"}
539 | }
540 | Observation: "The line is from \"As You Like It.\""
541 | 
542 | Action:
543 | {
544 |   "name": "web_search",
545 |   "arguments": {"query": "year As You Like It first performed"}
546 | }
547 | Observation: "\"As You Like It\" was first performed in 1603."
548 | 
549 | Action:
550 | {
551 |   "name": "calculate",
552 |   "arguments": {"expression": "2024 - 1603"}
553 | }
554 | Observation: "421 years."
555 | 
556 | Action:
557 | {
558 |   "name": "final_answer",
559 |   "arguments": {"answer": "\"As You Like It\" contains the line \"All the world's a stage\" and was first performed 421 years ago in 1603."}
560 | }
561 | 
562 | Above examples were using notional tools that might not exist for you. You only have access to these tools:
563 | {%- for tool in tools.values() %}
564 | - {{ tool.name }}: {{ tool.description }}
565 |     Takes inputs: {{tool.inputs}}
566 |     Returns an output of type: {{tool.output_type}}
567 | {%- endfor %}
568 | 
569 | {%- if managed_agents and managed_agents.values() | list %}
570 | You can also give tasks to team members.
571 | Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
572 | Given that this team member is a real human, you should be very verbose in your task.
573 | Here is a list of the team members that you can call:
574 | {%- for agent in managed_agents.values() %}
575 | - {{ agent.name }}: {{ agent.description }}
576 | {%- endfor %}
577 | {%- else %}
578 | {%- endif %}
579 | 
580 | Here are the rules you should always follow to solve your task:
581 | 1. ALWAYS provide a tool call, else you will fail.
582 | 2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead.
583 | 3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself.
584 | If no tool call is needed, use final_answer tool to return your answer.
585 | 4. Never re-do a tool call that you previously did with the exact same parameters.
586 | 
587 | Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
588 | """)


--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/README.md:
--------------------------------------------------------------------------------
 1 | ## Semantic Search and Reranking
 2 | 
 3 | The OpenDeepSearch library provides a flexible framework for semantic search and document reranking. At its core is the `BaseSemanticSearcher` class, which can be extended to implement various reranking strategies.
 4 | 
 5 | ### Creating Your Own Reranker
 6 | 
 7 | To implement your own reranker, simply inherit from `BaseSemanticSearcher` and implement the `_get_embeddings()` method:
 8 | 
 9 | ```python
10 | from opendeepsearch.ranking_models.base_reranker import BaseSemanticSearcher
11 | import torch
12 | class MyCustomReranker(BaseSemanticSearcher):
13 |     def init(self):
14 |         # Initialize your embedding model here
15 |         super().__init__()
16 |         self.model = YourEmbeddingModel()
17 |     def get_embeddings(self, texts: List[str]) -> torch.Tensor:
18 |         # Implement your embedding logic here
19 |         pass
20 |     embeddings = self.model.encode(texts)
21 |     return torch.tensor(embeddings)
22 | ```
23 | 
24 | The base class automatically handles:
25 | - Similarity score calculation
26 | - Score normalization (softmax, scaling, or none)
27 | - Document reranking
28 | - Top-k selection
29 | 
30 | ### Using Infinity Rerankers
31 | 
32 | For high-performance reranking, we support [Infinity](https://github.com/michaelfeil/infinity) rerankers which offer state-of-the-art performance. To use an Infinity reranker, first start the Infinity server:
33 | 
34 | ```bash
35 | # requires ~16-32GB VRAM NVIDIA Compute Capability >= 8.0
36 | docker run \
37 | -v $PWD/data:/app/.cache --gpus "0" -p "7997":"7997" \
38 | michaelf34/infinity:0.0.68-trt-onnx \
39 | v2 --model-id Alibaba-NLP/gte-Qwen2-7B-instruct --revision "refs/pr/38" \
40 | --dtype bfloat16 --batch-size 8 --device cuda --engine torch --port 7997 \
41 | --no-bettertransformer
42 | ```
43 | 
44 | This will start an Infinity server using the Qwen2-7B-instruct model. The server will be available at `localhost:7997`.
45 | 
46 | Key parameters:
47 | - `--model-id`: The Hugging Face model ID to use (see [supported models](https://github.com/michaelfeil/infinity#supported-tasks-and-models-by-infinity))
48 | - `--dtype`: Data type for inference (bfloat16 recommended for modern GPUs)
49 | - `--batch-size`: Batch size for inference
50 | - `--port`: Port to expose the server on
51 | 
52 | For specialized deployments, Infinity provides several Docker images:
53 | - `latest-cpu` - For CPU-only inference
54 | - `latest-rocm` - For AMD ROCm GPUs
55 | - `latest-trt-onnx` - For NVIDIA GPUs with TensorRT/ONNX optimizations
56 | 
57 | See the [Infinity documentation](https://michaelfeil.github.io/infinity/) for more details on deployment options and configuration.
58 | 
59 | Note: Ensure you have sufficient VRAM (16-32GB) and a compatible NVIDIA GPU (Compute Capability ≥ 8.0) before running the Infinity server.
60 | 
61 | ### Using Jina AI (or API based) Rerankers
62 | 
63 | Jina AI provides powerful embedding models through their API service. The `JinaReranker` class offers a simple way to leverage these models:
64 | 
65 | ```python
66 | from opendeepsearch.ranking_models.jina_reranker import JinaReranker
67 | 
68 | # Initialize with your API key
69 | reranker = JinaReranker(api_key="your_api_key")  # or set JINA_API_KEY env variable
70 | 
71 | # Example usage
72 | query = "What is machine learning?"
73 | documents = [
74 |     "Machine learning is a subset of artificial intelligence",
75 |     "Deep learning is a type of machine learning",
76 |     "Natural language processing uses machine learning"
77 | ]
78 | 
79 | # Get top 2 most relevant documents
80 | results = reranker.search(query, documents, k=2)
81 | ```
82 | 
83 | The JinaReranker uses Jina's v3 embeddings by default, which provides:
84 | - 1024-dimensional embeddings
85 | - Optimized for text matching tasks
86 | - State-of-the-art performance for semantic search
87 | 
88 | To use JinaReranker:
89 | 1. Sign up for a Jina AI API key at https://jina.ai
90 | 2. Either pass the API key directly or set it as an environment variable `JINA_API_KEY`
91 | 3. Optionally specify a different model using the `model` parameter
92 | 
93 | Note: Unlike Infinity rerankers which run locally, Jina rerankers require an internet connection and API credits.


--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/base_reranker.py:
--------------------------------------------------------------------------------
  1 | from abc import ABC, abstractmethod
  2 | import torch
  3 | from typing import List, Dict, Union
  4 | 
  5 | class BaseSemanticSearcher(ABC):
  6 |     """
  7 |     Abstract base class for semantic search implementations.
  8 |     
  9 |     This class defines the interface that all semantic searchers must implement.
 10 |     Subclasses should implement the _get_embeddings method according to their
 11 |     specific embedding source.
 12 |     """
 13 |     
 14 |     @abstractmethod
 15 |     def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
 16 |         """
 17 |         Get embeddings for a list of texts.
 18 |         
 19 |         Args:
 20 |             texts: List of text strings to embed
 21 |             
 22 |         Returns:
 23 |             torch.Tensor containing the embeddings shape: (num_texts, embedding_dim)
 24 |         """
 25 |         pass
 26 | 
 27 |     def calculate_scores(
 28 |         self,
 29 |         queries: List[str],
 30 |         documents: List[str],
 31 |         normalize: str = "softmax"  # Options: "softmax", "scale", "none"
 32 |     ) -> torch.Tensor:
 33 |         """
 34 |         Calculate similarity scores between queries and documents.
 35 |         
 36 |         Args:
 37 |             queries: List of query strings
 38 |             documents: List of document strings
 39 |             normalize: Normalization method:
 40 |                       - "softmax": Apply softmax normalization (default)
 41 |                       - "scale": Scale to 0-100 range
 42 |                       - "none": No normalization
 43 |             
 44 |         Returns:
 45 |             torch.Tensor of shape (num_queries, num_documents) containing similarity scores
 46 |         """
 47 |         # Get embeddings for queries and documents
 48 |         query_embeddings = self._get_embeddings(queries)
 49 |         doc_embeddings = self._get_embeddings(documents)
 50 |         
 51 |         # Calculate similarity scores
 52 |         scores = query_embeddings @ doc_embeddings.T
 53 |         
 54 |         # Apply normalization
 55 |         if normalize == "softmax":
 56 |             scores = torch.softmax(scores, dim=-1)
 57 |         elif normalize == "scale":
 58 |             scores = scores * 100
 59 |         elif normalize == "none":
 60 |             pass
 61 |         else:
 62 |             raise ValueError(f"Unknown normalization method: {normalize}")
 63 |             
 64 |         return scores
 65 | 
 66 |     def rerank(
 67 |         self,
 68 |         query: Union[str, List[str]],
 69 |         documents: List[str],
 70 |         top_k: int = 5,
 71 |         normalize: str = "softmax"
 72 |     ) -> List[Dict[str, Union[str, float]]]:
 73 |         """
 74 |         Rerank documents based on their semantic similarity to the query.
 75 |         
 76 |         Args:
 77 |             query: Query string or list of query strings
 78 |             documents: List of documents to rerank
 79 |             top_k: Number of top results to return per query
 80 |             normalize: Normalization method for scores
 81 |             
 82 |         Returns:
 83 |             List of dicts containing reranked documents and their scores.
 84 |             For single query: [{"document": str, "score": float}, ...]
 85 |             For multiple queries: [[{"document": str, "score": float}, ...], ...]
 86 |         """
 87 |         queries = [query] if isinstance(query, str) else query
 88 |         scores = self.calculate_scores(queries, documents, normalize=normalize)
 89 |         
 90 |         results = []
 91 |         for query_scores in scores:
 92 |             top_indices = torch.topk(query_scores, min(top_k, len(documents)), dim=0)
 93 |             query_results = [
 94 |                 {
 95 |                     "document": documents[idx.item()],
 96 |                     "score": score.item()
 97 |                 }
 98 |                 for score, idx in zip(top_indices.values, top_indices.indices)
 99 |             ]
100 |             results.append(query_results)
101 |         
102 |         return results[0] if isinstance(query, str) else results
103 | 
104 |     def get_reranked_documents(
105 |         self,
106 |         query: Union[str, List[str]],
107 |         documents: List[str],
108 |         top_k: int = 5,
109 |         normalize: str = "softmax"
110 |     ) -> Union[List[str], List[List[str]]]:
111 |         """
112 |         Returns only the reranked documents without scores.
113 |         
114 |         Args:
115 |             query: Query string or list of query strings
116 |             documents: List of documents to rerank
117 |             top_k: Number of top results to return per query
118 |             normalize: Normalization method for scores
119 |             
120 |         Returns:
121 |             For single query: List of reranked document strings
122 |             For multiple queries: List of lists of reranked document strings
123 |         """
124 |         results = self.rerank(query, documents, top_k, normalize)
125 |         return "\n".join([x['document'].strip() for x in results])
126 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/chunker.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | from langchain_text_splitters import RecursiveCharacterTextSplitter
 3 | 
 4 | class Chunker:
 5 |     """A modular text chunking class that splits text into smaller, overlapping segments.
 6 |     
 7 |     This class provides a flexible way to break down large texts into smaller chunks
 8 |     while maintaining context through configurable overlap. It uses RecursiveCharacterTextSplitter
 9 |     from langchain under the hood.
10 |     
11 |     Attributes:
12 |         chunk_size (int): The target size for each text chunk.
13 |         chunk_overlap (int): The number of characters to overlap between chunks.
14 |         separators (List[str]): List of separators to use for splitting, in order of preference.
15 |         length_function (callable): Function to measure text length (default: len).
16 |     """
17 | 
18 |     def __init__(
19 |         self,
20 |         chunk_size: int = 150,
21 |         chunk_overlap: int = 50,
22 |         separators: Optional[List[str]] = None,
23 |         length_function: callable = len
24 |     ):
25 |         """Initialize the Chunker with specified parameters.
26 |         
27 |         Args:
28 |             chunk_size (int, optional): Target size for each chunk. Defaults to 250.
29 |             chunk_overlap (int, optional): Number of characters to overlap. Defaults to 50.
30 |             separators (List[str], optional): Custom separators for splitting.
31 |                 Defaults to ["\n\n", "\n", " "].
32 |             length_function (callable, optional): Function to measure text length.
33 |                 Defaults to len.
34 |         """
35 |         self.chunk_size = chunk_size
36 |         self.chunk_overlap = chunk_overlap
37 |         self.separators = separators or ["\n\n", "\n"]
38 |         self.length_function = length_function
39 |         
40 |         self.splitter = RecursiveCharacterTextSplitter(
41 |             separators=self.separators,
42 |             chunk_size=self.chunk_size,
43 |             chunk_overlap=self.chunk_overlap,
44 |             length_function=self.length_function
45 |         )
46 |     
47 |     def split_text(self, text: str) -> List[str]:
48 |         """Split a single text into chunks.
49 |         
50 |         Args:
51 |             text (str): The input text to be split into chunks.
52 |             
53 |         Returns:
54 |             List[str]: A list of text chunks.
55 |         """
56 |         return self.splitter.split_text(text)
57 |     
58 |     def split_texts(self, texts: List[str]) -> List[List[str]]:
59 |         """Split multiple texts into chunks.
60 |         
61 |         Args:
62 |             texts (List[str]): A list of input texts to be split into chunks.
63 |             
64 |         Returns:
65 |             List[List[str]]: A list of lists, where each inner list contains
66 |                 the chunks for one input text.
67 |         """
68 |         return [self.split_text(text) for text in texts]
69 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/infinity_rerank.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import requests
 3 | import json
 4 | from typing import List
 5 | from opendeepsearch.ranking_models.base_reranker import BaseSemanticSearcher
 6 | 
 7 | class InfinitySemanticSearcher(BaseSemanticSearcher):
 8 |     """
 9 |     A semantic reranking model that uses the Infinity Embedding API for text embeddings.
10 |     
11 |     This class provides methods to rerank documents based on their semantic similarity
12 |     to queries using embeddings from the Infinity API. The API endpoint expects to receive
13 |     text inputs and returns high-dimensional embeddings that capture semantic meaning.
14 |     
15 |     The default model used is 'Alibaba-NLP/gte-Qwen2-7B-instruct', but other models
16 |     available through the Infinity API can be specified.
17 |     
18 |     Attributes:
19 |         embedding_endpoint (str): URL of the Infinity Embedding API endpoint
20 |         model_name (str): Name of the embedding model to use
21 |         
22 |     Example:
23 |         ```python
24 |         reranker = SemanticSearch(
25 |             embedding_endpoint="http://localhost:7997/embeddings",
26 |             model_name="Alibaba-NLP/gte-Qwen2-7B-instruct"
27 |         )
28 |         
29 |         documents = [
30 |             "Munich is in Germany.",
31 |             "The sky is blue."
32 |         ]
33 |         
34 |         results = reranker.rerank(
35 |             query="What color is the sky?",
36 |             documents=documents,
37 |             top_k=1
38 |         )
39 |         ```
40 |     """
41 |     
42 |     def __init__(
43 |         self, 
44 |         embedding_endpoint: str = "http://localhost:7997/embeddings",
45 |         model_name: str = "Alibaba-NLP/gte-Qwen2-7B-instruct",
46 |         instruction_prefix: str = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
47 |     ):
48 |         """
49 |         Initialize the semantic search engine with Infinity Embedding API settings.
50 |         
51 |         Args:
52 |             embedding_endpoint: URL of the Infinity Embedding API endpoint
53 |             model_name: Name of the embedding model available in Infinity API
54 |             instruction_prefix: Prefix to add to queries for better search relevance
55 |         """
56 |         self.embedding_endpoint = embedding_endpoint
57 |         self.model_name = model_name
58 |         self.instruction_prefix = instruction_prefix
59 | 
60 |     def _get_embeddings(self, texts: List[str], embedding_type: str = "query") -> torch.Tensor:
61 |         """
62 |         Get embeddings for a list of texts using the Infinity API.
63 |         """
64 |         MAX_TEXTS = 2048
65 |         if len(texts) > MAX_TEXTS:
66 |             import warnings
67 |             warnings.warn(f"Number of texts ({len(texts)}) exceeds maximum of {MAX_TEXTS}. List will be truncated.")
68 |             texts = texts[:MAX_TEXTS]
69 | 
70 |         # Format queries with instruction prefix
71 |         formatted_texts = [
72 |             self.instruction_prefix + text if embedding_type == "query" else text
73 |             for text in texts
74 |         ]
75 | 
76 |         response = requests.post(
77 |             self.embedding_endpoint,
78 |             json={
79 |                 "model": self.model_name,
80 |                 "input": formatted_texts
81 |             }
82 |         )
83 |         
84 |         content_str = response.content.decode('utf-8')
85 |         content_json = json.loads(content_str)
86 |         return torch.tensor([item['embedding'] for item in content_json['data']])
87 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/jina_reranker.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import torch
 3 | from typing import List, Optional
 4 | from dotenv import load_dotenv
 5 | import os
 6 | from .base_reranker import BaseSemanticSearcher
 7 | 
 8 | class JinaReranker(BaseSemanticSearcher):
 9 |     """
10 |     Semantic searcher implementation using Jina AI's embedding API.
11 |     """
12 |     
13 |     def __init__(self, api_key: Optional[str] = None, model: str = "jina-embeddings-v3"):
14 |         """
15 |         Initialize the Jina reranker.
16 |         
17 |         Args:
18 |             api_key: Jina AI API key. If None, will load from environment variable JINA_API_KEY
19 |             model: Model name to use (default: "jina-embeddings-v3")
20 |         """
21 |         if api_key is None:
22 |             load_dotenv()
23 |             api_key = os.getenv('JINA_API_KEY')
24 |             if not api_key:
25 |                 raise ValueError("No API key provided and JINA_API_KEY not found in environment variables")
26 |         
27 |         self.api_url = 'https://api.jina.ai/v1/embeddings'
28 |         self.headers = {
29 |             'Content-Type': 'application/json',
30 |             'Authorization': f'Bearer {api_key}'
31 |         }
32 |         self.model = model
33 | 
34 |     def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
35 |         """
36 |         Get embeddings for a list of texts using Jina AI API.
37 |         
38 |         Args:
39 |             texts: List of text strings to embed
40 |             
41 |         Returns:
42 |             torch.Tensor containing the embeddings
43 |         """
44 |         data = {
45 |             "model": self.model,
46 |             "task": "text-matching",
47 |             "late_chunking": False,
48 |             "dimensions": 1024,
49 |             "embedding_type": "float",
50 |             "input": texts
51 |         }
52 |         
53 |         try:
54 |             response = requests.post(self.api_url, headers=self.headers, json=data)
55 |             response.raise_for_status()  # Raise exception for non-200 status codes
56 |             
57 |             # Extract embeddings from response
58 |             embeddings_data = [item["embedding"] for item in response.json()["data"]]
59 |             
60 |             # Convert to torch tensor
61 |             embeddings = torch.tensor(embeddings_data)
62 |             
63 |             return embeddings
64 |             
65 |         except requests.exceptions.RequestException as e:
66 |             raise RuntimeError(f"Error calling Jina AI API: {str(e)}")
67 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/serp_search/serp_search.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from dataclasses import dataclass
  3 | from typing import Dict, Any, Optional, List, TypeVar, Generic, Union
  4 | from abc import ABC, abstractmethod
  5 | 
  6 | import requests
  7 | 
  8 | T = TypeVar('T')
  9 | 
 10 | class SearchAPIException(Exception):
 11 |     """Custom exception for Search API related errors"""
 12 |     pass
 13 | 
 14 | class SerperAPIException(SearchAPIException):
 15 |     """Custom exception for Serper API related errors"""
 16 |     pass
 17 | 
 18 | class SearXNGException(SearchAPIException):
 19 |     """Custom exception for SearXNG related errors"""
 20 |     pass
 21 | 
 22 | @dataclass
 23 | class SerperConfig:
 24 |     """Configuration for Serper API"""
 25 |     api_key: str
 26 |     api_url: str = "https://google.serper.dev/search"
 27 |     default_location: str = 'us'
 28 |     timeout: int = 10
 29 | 
 30 |     @classmethod
 31 |     def from_env(cls) -> 'SerperConfig':
 32 |         """Create config from environment variables"""
 33 |         api_key = os.getenv("SERPER_API_KEY")
 34 |         if not api_key:
 35 |             raise SerperAPIException("SERPER_API_KEY environment variable not set")
 36 |         return cls(api_key=api_key)
 37 | 
 38 | @dataclass
 39 | class SearXNGConfig:
 40 |     """Configuration for SearXNG instance"""
 41 |     instance_url: str
 42 |     api_key: Optional[str] = None
 43 |     default_location: str = 'all'
 44 |     timeout: int = 10
 45 | 
 46 |     @classmethod
 47 |     def from_env(cls) -> 'SearXNGConfig':
 48 |         """Create config from environment variables"""
 49 |         instance_url = os.getenv("SEARXNG_INSTANCE_URL")
 50 |         if not instance_url:
 51 |             raise SearXNGException("SEARXNG_INSTANCE_URL environment variable not set")
 52 |         api_key = os.getenv("SEARXNG_API_KEY")  # Optional
 53 |         return cls(instance_url=instance_url, api_key=api_key)
 54 | 
 55 | class SearchResult(Generic[T]):
 56 |     """Container for search results with error handling"""
 57 |     def __init__(self, data: Optional[T] = None, error: Optional[str] = None):
 58 |         self.data = data
 59 |         self.error = error
 60 |         self.success = error is None
 61 | 
 62 |     @property
 63 |     def failed(self) -> bool:
 64 |         return not self.success
 65 | 
 66 | class SearchAPI(ABC):
 67 |     """Abstract base class for search APIs"""
 68 |     @abstractmethod
 69 |     def get_sources(
 70 |         self,
 71 |         query: str,
 72 |         num_results: int = 8,
 73 |         stored_location: Optional[str] = None
 74 |     ) -> SearchResult[Dict[str, Any]]:
 75 |         """Get search results from the API"""
 76 |         pass
 77 | 
 78 | class SerperAPI(SearchAPI):
 79 |     def __init__(self, api_key: Optional[str] = None, config: Optional[SerperConfig] = None):
 80 |         if api_key:
 81 |             self.config = SerperConfig(api_key=api_key)
 82 |         else:
 83 |             self.config = config or SerperConfig.from_env()
 84 | 
 85 |         self.headers = {
 86 |             'X-API-KEY': self.config.api_key,
 87 |             'Content-Type': 'application/json'
 88 |         }
 89 | 
 90 |     @staticmethod
 91 |     def extract_fields(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]:
 92 |         """Extract specified fields from a list of dictionaries"""
 93 |         return [{key: item.get(key, "") for key in fields if key in item} for item in items]
 94 | 
 95 |     def get_sources(
 96 |         self,
 97 |         query: str,
 98 |         num_results: int = 8,
 99 |         stored_location: Optional[str] = None
100 |     ) -> SearchResult[Dict[str, Any]]:
101 |         """
102 |         Fetch search results from Serper API.
103 | 
104 |         Args:
105 |             query: Search query string
106 |             num_results: Number of results to return (default: 8, max: 10)
107 |             stored_location: Optional location string
108 | 
109 |         Returns:
110 |             SearchResult containing the search results or error information
111 |         """
112 |         if not query.strip():
113 |             return SearchResult(error="Query cannot be empty")
114 | 
115 |         try:
116 |             search_location = (stored_location or self.config.default_location).lower()
117 | 
118 |             payload = {
119 |                 "q": query,
120 |                 "num": min(max(1, num_results), 10),
121 |                 "gl": search_location
122 |             }
123 | 
124 |             response = requests.post(
125 |                 self.config.api_url,
126 |                 headers=self.headers,
127 |                 json=payload,
128 |                 timeout=self.config.timeout
129 |             )
130 |             response.raise_for_status()
131 |             data = response.json()
132 | 
133 |             results = {
134 |                 'organic': self.extract_fields(
135 |                     data.get('organic', []),
136 |                     ['title', 'link', 'snippet', 'date']
137 |                 ),
138 |                 'topStories': self.extract_fields(
139 |                     data.get('topStories', []),
140 |                     ['title', 'imageUrl']
141 |                 ),
142 |                 'images': self.extract_fields(
143 |                     data.get('images', [])[:6],
144 |                     ['title', 'imageUrl']
145 |                 ),
146 |                 'graph': data.get('knowledgeGraph'),
147 |                 'answerBox': data.get('answerBox'),
148 |                 'peopleAlsoAsk': data.get('peopleAlsoAsk'),
149 |                 'relatedSearches': data.get('relatedSearches')
150 |             }
151 | 
152 |             return SearchResult(data=results)
153 | 
154 |         except requests.RequestException as e:
155 |             return SearchResult(error=f"API request failed: {str(e)}")
156 |         except Exception as e:
157 |             return SearchResult(error=f"Unexpected error: {str(e)}")
158 | 
159 | 
160 | class SearXNGAPI(SearchAPI):
161 |     """API client for SearXNG search engine"""
162 | 
163 |     def __init__(self, instance_url: Optional[str] = None, api_key: Optional[str] = None, config: Optional[SearXNGConfig] = None):
164 |         if instance_url:
165 |             self.config = SearXNGConfig(instance_url=instance_url, api_key=api_key)
166 |         else:
167 |             self.config = config or SearXNGConfig.from_env()
168 | 
169 |         self.headers = {'Content-Type': 'application/json'}
170 |         if self.config.api_key:
171 |             self.headers['X-API-Key'] = self.config.api_key
172 | 
173 |     def get_sources(
174 |         self,
175 |         query: str,
176 |         num_results: int = 8,
177 |         stored_location: Optional[str] = None
178 |     ) -> SearchResult[Dict[str, Any]]:
179 |         """
180 |         Fetch search results from SearXNG instance.
181 | 
182 |         Args:
183 |             query: Search query string
184 |             num_results: Number of results to return (default: 8)
185 |             stored_location: Optional location string (may not be supported by all instances)
186 | 
187 |         Returns:
188 |             SearchResult containing the search results or error information
189 |         """
190 |         if not query.strip():
191 |             return SearchResult(error="Query cannot be empty")
192 | 
193 |         try:
194 |             # Ensure the instance URL ends with /search
195 |             search_url = self.config.instance_url
196 |             if not search_url.endswith('/search'):
197 |                 search_url = search_url.rstrip('/') + '/search'
198 | 
199 |             # Prepare parameters for SearXNG
200 |             params = {
201 |                 'q': query,
202 |                 'format': 'json',
203 |                 'pageno': 1,
204 |                 'categories': 'general',
205 |                 'language': 'all',
206 |                 'time_range': None,
207 |                 'safesearch': 0,
208 |                 'engines': 'google,bing,duckduckgo',  # Default engines, can be customised
209 |                 'max_results': min(max(1, num_results), 20)  # Limit to reasonable range
210 |             }
211 | 
212 |             # Add location if provided and supported
213 |             if stored_location and stored_location != 'all':
214 |                 params['language'] = stored_location
215 | 
216 |             response = requests.get(
217 |                 search_url,
218 |                 headers=self.headers,
219 |                 params=params,
220 |                 timeout=self.config.timeout
221 |             )
222 |             response.raise_for_status()
223 |             data = response.json()
224 | 
225 |             # Transform SearXNG results to match SerperAPI format
226 |             organic_results = []
227 |             for result in data.get('results', [])[:num_results]:
228 |                 organic_results.append({
229 |                     'title': result.get('title', ''),
230 |                     'link': result.get('url', ''),
231 |                     'snippet': result.get('content', ''),
232 |                     'date': result.get('publishedDate', '')
233 |                 })
234 | 
235 |             # Extract image results if available
236 |             image_results = []
237 |             for result in data.get('results', []):
238 |                 if result.get('img_src'):
239 |                     image_results.append({
240 |                         'title': result.get('title', ''),
241 |                         'imageUrl': result.get('img_src', '')
242 |                     })
243 |             image_results = image_results[:6]  # Limit to 6 images like SerperAPI
244 | 
245 |             # Format results to match SerperAPI structure
246 |             results = {
247 |                 'organic': organic_results,
248 |                 'images': image_results,
249 |                 'topStories': [],  # SearXNG might not have direct equivalent
250 |                 'graph': None,     # SearXNG doesn't provide knowledge graph
251 |                 'answerBox': None, # SearXNG doesn't provide answer box
252 |                 'peopleAlsoAsk': None,
253 |                 'relatedSearches': data.get('suggestions', [])
254 |             }
255 | 
256 |             return SearchResult(data=results)
257 | 
258 |         except requests.RequestException as e:
259 |             return SearchResult(error=f"SearXNG API request failed: {str(e)}")
260 |         except Exception as e:
261 |             return SearchResult(error=f"Unexpected error with SearXNG: {str(e)}")
262 | 
263 | 
264 | def create_search_api(
265 |     search_provider: str = "serper",
266 |     serper_api_key: Optional[str] = None,
267 |     searxng_instance_url: Optional[str] = None,
268 |     searxng_api_key: Optional[str] = None
269 | ) -> SearchAPI:
270 |     """
271 |     Factory function to create the appropriate search API client.
272 | 
273 |     Args:
274 |         search_provider: The search provider to use ('serper' or 'searxng')
275 |         serper_api_key: Optional API key for Serper
276 |         searxng_instance_url: Optional SearXNG instance URL
277 |         searxng_api_key: Optional API key for SearXNG instance
278 | 
279 |     Returns:
280 |         An instance of a SearchAPI implementation
281 | 
282 |     Raises:
283 |         ValueError: If an invalid search provider is specified
284 |     """
285 |     if search_provider.lower() == "serper":
286 |         return SerperAPI(api_key=serper_api_key)
287 |     elif search_provider.lower() == "searxng":
288 |         return SearXNGAPI(instance_url=searxng_instance_url, api_key=searxng_api_key)
289 |     else:
290 |         raise ValueError(f"Invalid search provider: {search_provider}. Must be 'serper' or 'searxng'")
291 | 


--------------------------------------------------------------------------------
/src/opendeepsearch/wolfram_tool.py:
--------------------------------------------------------------------------------
 1 | from smolagents import Tool
 2 | import wolframalpha
 3 | import json
 4 | import os
 5 | 
 6 | class WolframAlphaTool(Tool):
 7 |     name = "calculate"
 8 |     description = """
 9 |     Performs computational, mathematical, and factual queries using Wolfram Alpha's computational knowledge engine.
10 |     """
11 |     inputs = {
12 |         "query": {
13 |             "type": "string",
14 |             "description": "The query to send to Wolfram Alpha",
15 |         },
16 |     }
17 |     output_type = "string"
18 |     
19 |     def __init__(self, app_id: str):
20 |         super().__init__()
21 |         self.app_id = app_id
22 |     
23 |     def setup(self):
24 |         self.search_tool = WolframAlphaTool(
25 |             self.app_id,
26 |         )
27 |         
28 |     def forward(self, query: str):
29 |             
30 |         # Initialize the Wolfram Alpha client
31 |         self.wolfram_client = wolframalpha.Client(self.app_id)
32 |         
33 |         try:
34 |             # Send the query to Wolfram Alpha
35 |             res = self.wolfram_client.query(query)
36 |             
37 |             # Process the results
38 |             results = []
39 |             for pod in res.pods:
40 |                 if pod.title:
41 |                     for subpod in pod.subpods:
42 |                         if subpod.plaintext:
43 |                             results.append({
44 |                                 'title': pod.title,
45 |                                 'result': subpod.plaintext
46 |                             })
47 |                             
48 |             # Convert results to a JSON-serializable format
49 |             formatted_result = {
50 |                 'queryresult': {
51 |                     'success': bool(results),
52 |                     'inputstring': query,
53 |                     'pods': [
54 |                         {
55 |                             'title': result['title'], 
56 |                             'subpods': [{'title': '', 'plaintext': result['result']}]
57 |                         } for result in results
58 |                     ]
59 |                 }
60 |             }
61 |             
62 |             # Initialize final_result with a default value
63 |             final_result = "No result found."
64 |             
65 |             # Extract the pods from the query result
66 |             pods = formatted_result.get("queryresult", {}).get("pods", [])
67 |             
68 |             # Loop through pods to find the "Result" title
69 |             for pod in pods:
70 |                 if pod.get("title") == "Result":
71 |                     # Extract and return the plaintext from the subpods
72 |                     subpods = pod.get("subpods", [])
73 |                     if subpods:
74 |                         final_result = subpods[0].get("plaintext", "").strip()
75 |                         break
76 |             
77 |             # If no "Result" pod was found, use the first available result
78 |             if final_result == "No result found." and results:
79 |                 final_result = results[0]['result']
80 |                 
81 |             
82 |             print(f"QUERY: {query}\n\nRESULT: {final_result}")
83 |             return final_result
84 |             
85 |         except Exception as e:
86 |             error_message = f"Error querying Wolfram Alpha: {str(e)}"
87 |             print(error_message)
88 |             return error_message
89 |     


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------