├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── assets ├── evals.png └── sentient-logo-narrow.png ├── evals ├── README.md ├── autograde_df.py ├── datasets │ ├── frames_test_set.csv │ └── simple_qa_test_set.csv ├── eval_gpt_web.py ├── eval_tasks.py ├── gpt_web_extract.py └── grader_prompts.py ├── gradio_demo.py ├── pdm.lock ├── pyproject.toml ├── requirements.txt ├── src └── opendeepsearch │ ├── __init__.py │ ├── context_building │ ├── build_context.py │ └── process_sources_pro.py │ ├── context_scraping │ ├── basic_web_scraper.py │ ├── crawl4ai_scraper.py │ ├── extraction_result.py │ ├── fast_scraper.py │ ├── strategy_factory.py │ └── utils.py │ ├── ods_agent.py │ ├── ods_tool.py │ ├── prompts.py │ ├── ranking_models │ ├── README.md │ ├── base_reranker.py │ ├── chunker.py │ ├── infinity_rerank.py │ └── jina_reranker.py │ ├── serp_search │ └── serp_search.py │ └── wolfram_tool.py └── tests └── __init__.py /.env.example: -------------------------------------------------------------------------------- 1 | # SEARXNG_INSTANCE_URL=http://searxng:8080 2 | # or 3 | # SERPER_API_KEY= 4 | 5 | JINA_API_KEY= 6 | WOLFRAM_ALPHA_APP_ID= 7 | 8 | ### Providers ### 9 | OPENAI_API_KEY= 10 | OPENAI_BASE_URL= 11 | ANTHROPIC_API_KEY= 12 | OPENROUTER_API_KEY= 13 | 14 | # LiteLLM model IDs for different tasks 15 | LITELLM_MODEL_ID=openrouter/google/gemini-2.0-flash-001 16 | LITELLM_SEARCH_MODEL_ID=openrouter/google/gemini-2.0-flash-001 17 | LITELLM_ORCHESTRATOR_MODEL_ID=openrouter/google/gemini-2.0-flash-001 18 | LITELLM_EVAL_MODEL_ID=gpt-4o-mini 19 | FIREWORKS_API_KEY= 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.ipynb 9 | *.ipynb_checkpoints 10 | output/ 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm-project.org/#use-with-ide 113 | .pdm.toml 114 | .pdm-python 115 | .pdm-build/ 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | #.idea/ 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀 2 | 3 | 4 | 5 | 6 | 7 |
8 | <div align="center">
9 |   <img src="./assets/sentient-logo-narrow.png" alt="Sentient logo" />
10 | </div>
11 | 
12 | <div align="center">
13 |   Homepage · <a href="https://github.com/sentient-agi/OpenDeepSearch">GitHub</a> · Hugging Face · Discord · Twitter · <a href="https://arxiv.org/abs/2503.20201">Paper</a>
14 | </div>
36 | 37 | ## Description 📝 38 | 39 | OpenDeepSearch is a lightweight yet powerful search tool designed for seamless integration with AI agents. It enables deep web search and retrieval, optimized for use with Hugging Face's **[SmolAgents](https://github.com/huggingface/smolagents)** ecosystem. 40 | 41 |
42 | <img src="./assets/evals.png" alt="Evaluation Results" /> 43 |
44 | 45 | - **Performance**: ODS performs on par with closed source search alternatives on single-hop queries such as [SimpleQA](https://openai.com/index/introducing-simpleqa/) 🔍. 46 | - **Advanced Capabilities**: ODS performs much better than closed source search alternatives on multi-hop queries such as [FRAMES bench](https://huggingface.co/datasets/google/frames-benchmark) 🚀. 47 | 48 | ## Table of Contents 📑 49 | 50 | - [🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀](#opendeepsearch-democratizing-search-with-open-source-reasoning-models-and-reasoning-agents-) 51 | - [Description 📝](#description-) 52 | - [Table of Contents 📑](#table-of-contents-) 53 | - [Features ✨](#features-) 54 | - [Installation 📚](#installation-) 55 | - [Setup](#setup) 56 | - [Usage ️](#usage-️) 57 | - [Using OpenDeepSearch Standalone 🔍](#using-opendeepsearch-standalone-) 58 | - [Running the Gradio Demo 🖥️](#running-the-gradio-demo-️) 59 | - [Integrating with SmolAgents \& LiteLLM 🤖⚙️](#integrating-with-smolagents--litellm-️) 60 | - [](#) 61 | - [ReAct agent with math and search tools 🤖⚙️](#react-agent-with-math-and-search-tools-️) 62 | - [](#-1) 63 | - [Search Modes 🔄](#search-modes-) 64 | - [Default Mode ⚡](#default-mode-) 65 | - [Pro Mode 🔍](#pro-mode-) 66 | - [Acknowledgments 💡](#acknowledgments-) 67 | - [Citation](#citation) 68 | - [Contact 📩](#contact-) 69 | 70 | ## Features ✨ 71 | 72 | - **Semantic Search** 🧠: Leverages **[Crawl4AI](https://github.com/unclecode/crawl4ai)** and semantic search rerankers (such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main) and [Jina AI](https://jina.ai/)) to provide in-depth results 73 | - **Two Modes of Operation** ⚡: 74 | - **Default Mode**: Quick and efficient search with minimal latency. 75 | - **Pro Mode (Deep Search)**: More in-depth and accurate results at the cost of additional processing time. 76 | - **Optimized for AI Agents** 🤖: Works seamlessly with **SmolAgents** like `CodeAgent`. 77 | - **Fast and Lightweight** ⚡: Designed for speed and efficiency with minimal setup. 78 | - **Extensible** 🔌: Easily configurable to work with different models and APIs. 79 | 80 | ## Installation 📚 81 | 82 | To install OpenDeepSearch, run: 83 | 84 | ```bash 85 | pip install -e . #you can also use: uv pip install -e . 86 | pip install -r requirements.txt #you can also use: uv pip install -r requirements.txt 87 | ``` 88 | 89 | Note: you must have `torch` installed. 90 | Note: using `uv` instead of regular `pip` makes life much easier! 91 | 92 | ### Using PDM (Alternative Package Manager) 📦 93 | 94 | You can also use PDM as an alternative package manager for OpenDeepSearch. PDM is a modern Python package and dependency manager supporting the latest PEP standards. 95 | 96 | ```bash 97 | # Install PDM if you haven't already 98 | curl -sSL https://raw.githubusercontent.com/pdm-project/pdm/main/install-pdm.py | python3 - 99 | 100 | # Initialize a new PDM project 101 | pdm init 102 | 103 | # Install OpenDeepSearch and its dependencies 104 | pdm install 105 | 106 | # Activate the virtual environment 107 | eval "$(pdm venv activate)" 108 | ``` 109 | 110 | PDM offers several advantages: 111 | - Lockfile support for reproducible installations 112 | - PEP 582 support (no virtual environment needed) 113 | - Fast dependency resolution 114 | - Built-in virtual environment management 115 | 116 | ## Setup 117 | 118 | 1. 
**Choose a Search Provider**:
119 |    - **Option 1: Serper.dev**: Get **free 2500 credits** and add your API key.
120 |      - Visit [serper.dev](https://serper.dev) to create an account.
121 |      - Retrieve your API key and store it as an environment variable:
122 | 
123 |      ```bash
124 |      export SERPER_API_KEY='your-api-key-here'
125 |      ```
126 | 
127 |    - **Option 2: SearXNG**: Use a self-hosted or public SearXNG instance.
128 |      - Specify the SearXNG instance URL when initializing OpenDeepSearch.
129 |      - Optionally provide an API key if your instance requires authentication:
130 | 
131 |      ```bash
132 |      export SEARXNG_INSTANCE_URL='https://your-searxng-instance.com'
133 |      export SEARXNG_API_KEY='your-api-key-here'  # Optional
134 |      ```
135 | 
136 | 2. **Choose a Reranking Solution**:
137 |    - **Quick Start with Jina**: Sign up at [Jina AI](https://jina.ai/) to get an API key for immediate use
138 |    - **Self-hosted Option**: Set up [Infinity Embeddings](https://github.com/michaelfeil/infinity) server locally with open source models such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main)
139 |    - For more details on reranking options, see our [Rerankers Guide](src/opendeepsearch/ranking_models/README.md)
140 | 
141 | 3. **Set up LiteLLM Provider**:
142 |    - Choose a provider from the [supported list](https://docs.litellm.ai/docs/providers/), including:
143 |      - OpenAI
144 |      - Anthropic
145 |      - Google (Gemini)
146 |      - OpenRouter
147 |      - HuggingFace
148 |      - Fireworks
149 |      - And many more!
150 |    - Set your chosen provider's API key as an environment variable:
151 |    ```bash
152 |    export <PROVIDER>_API_KEY='your-api-key-here'  # e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY
153 |    ```
154 |    - For OpenAI, you can also set a custom base URL (useful for self-hosted endpoints or proxies):
155 |    ```bash
156 |    export OPENAI_BASE_URL='https://your-custom-openai-endpoint.com'
157 |    ```
158 |    - You can set default LiteLLM model IDs for different tasks:
159 |    ```bash
160 |    # General default model (fallback for all tasks)
161 |    export LITELLM_MODEL_ID='openrouter/google/gemini-2.0-flash-001'
162 | 
163 |    # Task-specific models
164 |    export LITELLM_SEARCH_MODEL_ID='openrouter/google/gemini-2.0-flash-001'  # For search tasks
165 |    export LITELLM_ORCHESTRATOR_MODEL_ID='openrouter/google/gemini-2.0-flash-001'  # For agent orchestration
166 |    export LITELLM_EVAL_MODEL_ID='gpt-4o-mini'  # For evaluation tasks
167 |    ```
168 |    - When initializing OpenDeepSearch, you can specify your chosen model using the provider's format (this will override the environment variables):
169 |    ```python
170 |    search_agent = OpenDeepSearchTool(model_name="provider/model-name")  # e.g., "anthropic/claude-3-opus-20240229", 'huggingface/microsoft/codebert-base', 'openrouter/google/gemini-2.0-flash-001'
171 |    ```
172 | 
173 | ## Usage ️
174 | 
175 | You can use OpenDeepSearch independently or integrate it with **SmolAgents** for enhanced reasoning and code generation capabilities.
176 | 177 | ### Using OpenDeepSearch Standalone 🔍 178 | 179 | ```python 180 | from opendeepsearch import OpenDeepSearchTool 181 | import os 182 | 183 | # Set environment variables for API keys 184 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here" # If using Serper 185 | # Or for SearXNG 186 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com" 187 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here" # Optional 188 | 189 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here" 190 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here" 191 | 192 | # Using Serper (default) 193 | search_agent = OpenDeepSearchTool( 194 | model_name="openrouter/google/gemini-2.0-flash-001", 195 | reranker="jina" 196 | ) 197 | 198 | # Or using SearXNG 199 | # search_agent = OpenDeepSearchTool( 200 | # model_name="openrouter/google/gemini-2.0-flash-001", 201 | # reranker="jina", 202 | # search_provider="searxng", 203 | # searxng_instance_url="https://your-searxng-instance.com", 204 | # searxng_api_key="your-api-key-here" # Optional 205 | # ) 206 | 207 | if not search_agent.is_initialized: 208 | search_agent.setup() 209 | 210 | query = "Fastest land animal?" 211 | result = search_agent.forward(query) 212 | print(result) 213 | ``` 214 | 215 | ### Running the Gradio Demo 🖥️ 216 | 217 | To try out OpenDeepSearch with a user-friendly interface, simply run: 218 | 219 | ```bash 220 | python gradio_demo.py 221 | ``` 222 | 223 | This will launch a local web interface where you can test different search queries and modes interactively. 224 | 225 | You can customize the demo with command-line arguments: 226 | 227 | ```bash 228 | # Using Serper (default) 229 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina" 230 | 231 | # Using SearXNG 232 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina" \ 233 | --search-provider "searxng" --searxng-instance "https://your-searxng-instance.com" \ 234 | --searxng-api-key "your-api-key-here" # Optional 235 | ``` 236 | 237 | Available options: 238 | - `--model-name`: LLM model to use for search 239 | - `--orchestrator-model`: LLM model for the agent orchestrator 240 | - `--reranker`: Reranker to use (`jina` or `infinity`) 241 | - `--search-provider`: Search provider to use (`serper` or `searxng`) 242 | - `--searxng-instance`: SearXNG instance URL (required if using `searxng`) 243 | - `--searxng-api-key`: SearXNG API key (optional) 244 | - `--serper-api-key`: Serper API key (optional, will use environment variable if not provided) 245 | - `--openai-base-url`: OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided) 246 | 247 | ### Integrating with SmolAgents & LiteLLM 🤖⚙️ 248 | 249 | #### 250 | 251 | ```python 252 | from opendeepsearch import OpenDeepSearchTool 253 | from smolagents import CodeAgent, LiteLLMModel 254 | import os 255 | 256 | # Set environment variables for API keys 257 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here" # If using Serper 258 | # Or for SearXNG 259 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com" 260 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here" # Optional 261 | 262 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here" 263 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here" 264 | 265 | # Using Serper (default) 266 | search_agent = OpenDeepSearchTool( 267 | model_name="openrouter/google/gemini-2.0-flash-001", 268 | reranker="jina" 269 | ) 270 
|
271 | # Or using SearXNG
272 | # search_agent = OpenDeepSearchTool(
273 | #     model_name="openrouter/google/gemini-2.0-flash-001",
274 | #     reranker="jina",
275 | #     search_provider="searxng",
276 | #     searxng_instance_url="https://your-searxng-instance.com",
277 | #     searxng_api_key="your-api-key-here"  # Optional
278 | # )
279 | 
280 | model = LiteLLMModel(
281 |     "openrouter/google/gemini-2.0-flash-001",
282 |     temperature=0.2
283 | )
284 | 
285 | code_agent = CodeAgent(tools=[search_agent], model=model)
286 | query = "How long would a cheetah at full speed take to run the length of Pont Alexandre III?"
287 | result = code_agent.run(query)
288 | 
289 | print(result)
290 | ```
291 | ### ReAct agent with math and search tools 🤖⚙️
292 | 
293 | ####
294 | ```python
295 | from opendeepsearch import OpenDeepSearchTool
296 | from opendeepsearch.wolfram_tool import WolframAlphaTool
297 | from opendeepsearch.prompts import REACT_PROMPT
298 | from smolagents import LiteLLMModel, ToolCallingAgent, Tool
299 | import os
300 | 
301 | # Set environment variables for API keys
302 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here"
303 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
304 | os.environ["WOLFRAM_ALPHA_APP_ID"] = "your-wolfram-alpha-app-id-here"
305 | os.environ["FIREWORKS_API_KEY"] = "your-fireworks-api-key-here"
306 | 
307 | model = LiteLLMModel(
308 |     "fireworks_ai/llama-v3p1-70b-instruct",  # Fireworks-hosted Llama 3.1 70B model
309 |     temperature=0.7
310 | )
311 | search_agent = OpenDeepSearchTool(model_name="fireworks_ai/llama-v3p1-70b-instruct", reranker="jina")  # Set reranker to "jina" or "infinity"
312 | 
313 | # Initialize the Wolfram Alpha tool
314 | wolfram_tool = WolframAlphaTool(app_id=os.environ["WOLFRAM_ALPHA_APP_ID"])
315 | 
316 | # Initialize the React Agent with search and wolfram tools
317 | react_agent = ToolCallingAgent(
318 |     tools=[search_agent, wolfram_tool],
319 |     model=model,
320 |     prompt_templates=REACT_PROMPT  # Using REACT_PROMPT as system prompt
321 | )
322 | 
323 | # Example query for the React Agent
324 | query = "What is the distance, in metres, between the Colosseum in Rome and the Rialto bridge in Venice?"
325 | result = react_agent.run(query)
326 | 
327 | print(result)
328 | ```
329 | 
330 | ## Search Modes 🔄
331 | 
332 | OpenDeepSearch offers two distinct search modes to balance speed against depth (a usage sketch follows the mode descriptions below):
333 | 
334 | ### Default Mode ⚡
335 | - Uses SERP-based interaction for quick results
336 | - Minimal processing overhead
337 | - Ideal for single-hop, straightforward queries
338 | - Fast response times
339 | - Perfect for basic information retrieval
340 | 
341 | ### Pro Mode 🔍
342 | - Involves comprehensive web scraping
343 | - Implements semantic reranking of results
344 | - Includes advanced post-processing of data
345 | - Slightly longer processing time
346 | - Excels at:
347 |   - Multi-hop queries
348 |   - Complex search requirements
349 |   - Detailed information gathering
350 |   - Questions requiring cross-reference verification
351 | 
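To make the two modes concrete, here is a minimal sketch. The Default Mode call mirrors the standalone example earlier in this README; the Pro Mode line is left commented out because this README does not pin down the exact switch, so the `pro_mode` flag shown is an assumption for illustration only (check `src/opendeepsearch/ods_agent.py` for the authoritative interface).

```python
from opendeepsearch import OpenDeepSearchTool

search_agent = OpenDeepSearchTool(
    model_name="openrouter/google/gemini-2.0-flash-001",
    reranker="jina"
)
if not search_agent.is_initialized:
    search_agent.setup()

# Default Mode: quick, SERP-based retrieval with minimal latency.
print(search_agent.forward("Fastest land animal?"))

# Pro Mode: deeper scraping plus semantic reranking. The `pro_mode` flag
# below is a hypothetical name, not a parameter documented in this README.
# print(search_agent.forward("Fastest land animal?", pro_mode=True))
```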
352 | ## Acknowledgments 💡
353 | 
354 | OpenDeepSearch is built on the shoulders of great open-source projects:
355 | 
356 | - **[SmolAgents](https://huggingface.co/docs/smolagents/index)** 🤗 – Powers the agent framework and reasoning capabilities.
357 | - **[Crawl4AI](https://github.com/unclecode/crawl4ai)** 🕷️ – Provides data crawling support.
358 | - **[Infinity Embedding API](https://github.com/michaelfeil/infinity)** 🌍 – Powers semantic search capabilities.
359 | - **[LiteLLM](https://www.litellm.ai/)** 🔥 – Used for efficient AI model integration.
360 | - **Various Open-Source Libraries** 📚 – Enhancing search and retrieval functionalities.
361 | 
362 | ## Citation
363 | 
364 | If you use `OpenDeepSearch` in your work, please cite it using the following BibTeX entry:
365 | 
366 | ```
367 | @misc{alzubi2025opendeepsearchdemocratizing,
368 |       title={Open Deep Search: Democratizing Search with Open-source Reasoning Agents},
369 |       author={Salaheddin Alzubi and Creston Brooks and Purva Chiniya and Edoardo Contente and Chiara von Gerlach and Lucas Irwin and Yihan Jiang and Arda Kaz and Windsor Nguyen and Sewoong Oh and Himanshu Tyagi and Pramod Viswanath},
370 |       year={2025},
371 |       eprint={2503.20201},
372 |       archivePrefix={arXiv},
373 |       primaryClass={cs.LG},
374 |       url={https://arxiv.org/abs/2503.20201},
375 | }
376 | ```
377 | 
378 | 
379 | ## Contact 📩
380 | 
381 | For questions or collaborations, open an issue or reach out to the maintainers.
382 | 
--------------------------------------------------------------------------------
/assets/evals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/evals.png
--------------------------------------------------------------------------------
/assets/sentient-logo-narrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/sentient-logo-narrow.png
--------------------------------------------------------------------------------
/evals/README.md:
--------------------------------------------------------------------------------
1 | # Evaluation Scripts
2 | 
3 | This directory contains scripts for running evaluations and autograding model outputs.
4 | 
5 | ## Available Commands
6 | 
7 | ### Autograde DataFrame Evaluation
8 | To evaluate and autograde DataFrame outputs:
9 | 
10 | ```bash
11 | python evals/autograde_df.py <path_to_results_jsonl> [--num_cpus <n>]
12 | ```
13 | 
14 | Example:
15 | 
16 | ```bash
17 | python evals/autograde_df.py output/fireworks_ai__accounts__fireworks__models__qwq-32b/codeact/simple_qa_test_set/fireworks_ai__accounts__fireworks__models__qwq-32b__codeact__simple_qa_test_set__trial1.jsonl
18 | ```
19 | 
20 | This command grades each recorded answer in the specified JSONL file against the gold answer and writes the grades back to the same file as a `final_grade` column.
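For reference, `autograde_df.py` reads three fields from every line of the results file: `original_question`, `answer`, and `true_answer`. A minimal illustrative line (the values here are made up) looks like:

```json
{"original_question": "Fastest land animal?", "answer": "The cheetah is the fastest land animal.", "true_answer": "Cheetah"}
```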
21 | 
22 | ### Run Task Evaluations
23 | To run evaluations on a dataset with parallel processing:
24 | 
25 | ```bash
26 | python ./evals/eval_tasks.py --parallel-workers=8 --num-trials=1 --eval-tasks ./evals/datasets/frames_test_set.csv ./evals/datasets/simple_qa_test_set.csv
27 | ```
28 | 
29 | Parameters:
30 | - `--date`: Optional date for the evaluation
31 | - `--eval-tasks`: List of paths to CSV files containing evaluation tasks (default: ["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"]); pass the paths space-separated, as in the example above
32 | - `--search-model-id`: Model ID for the search tool (default: "fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct")
33 | - `--model-type`: Type of model to use, either "LiteLLMModel" or "HfApiModel" (default: "LiteLLMModel")
34 | - `--model-id`: ID of the model to use (default: "fireworks_ai/accounts/fireworks/models/qwq-32b")
35 | - `--agent-action-type`: Type of agent action: "codeact", "tool-calling", or "vanilla" (default: "codeact")
36 | - `--parallel-workers`: Number of parallel workers to use (default: 8)
37 | - `--num-trials`: Number of evaluation trials to run (default: 1)
38 | 
39 | The results are appended as JSONL files under the `output/` directory.
40 | 
41 | ## Output
42 | Evaluation results are stored in the following locations:
43 | - Task evaluation results: `output/<model_id>/<agent_action_type>/<task>/` directories
44 | - DataFrame autograding results: written back into the input JSONL file as a `final_grade` column
45 | 
46 | 
47 | 
--------------------------------------------------------------------------------
/evals/autograde_df.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import litellm
3 | import argparse
4 | from evals.grader_prompts import GRADER_TEMPLATE
5 | from multiprocessing import Pool, cpu_count
6 | from tqdm import tqdm
7 | 
8 | def grade_row(row_data):
9 |     idx, row = row_data
10 |     question = row['original_question']
11 |     predicted_answer = row['answer']
12 |     gold_answer = row['true_answer']
13 | 
14 |     input_prompt = GRADER_TEMPLATE.format(
15 |         question=question,
16 |         predicted_answer=predicted_answer,
17 |         target=gold_answer
18 |     )
19 | 
20 |     try:
21 |         output = litellm.completion(
22 |             model="openrouter/google/gemini-2.0-flash-001",
23 |             messages=[{"role": "user", "content": input_prompt}],
24 |             temperature=0.0
25 |         )['choices'][0]['message']['content']
26 |         return idx, output
27 |     except Exception as e:
28 |         print(f"Error processing row {idx}: {e}")
29 |         return idx, "Error"
30 | 
31 | def autograde_df(df_path, num_cpus=4):
32 |     # Read the dataframe
33 |     df = pd.read_json(df_path, lines=True)
34 | 
35 |     # Prepare data for parallel processing
36 |     row_data = list(df.iterrows())
37 | 
38 |     # Use specified number of CPU cores
39 |     n_processes = max(1, min(num_cpus, cpu_count()))
40 |     print(f"Using {n_processes} processes")
41 | 
42 |     # Create process pool and process rows in parallel
43 |     with Pool(n_processes) as pool:
44 |         # Use tqdm for progress bar
45 |         results = list(tqdm(
46 |             pool.imap(grade_row, row_data),
47 |             total=len(row_data),
48 |             desc="Grading"
49 |         ))
50 | 
51 |     # Sort results by index and extract grades
52 |     results.sort(key=lambda x: x[0])
53 |     final_grades = [grade for _, grade in results]
54 | 
55 |     # Add the grades as a new column
56 |     df['final_grade'] = final_grades
57 | 
58 |     # Save the updated dataframe back to the same file
59 |     df.to_json(df_path, orient='records', lines=True)
60 |     print("Grading completed and results saved!")
61 | 
62 | if __name__ == "__main__":
63 |     parser = 
argparse.ArgumentParser(description='Auto-grade answers in a DataFrame') 64 | parser.add_argument('df_path', type=str, help='Path to the DataFrame JSON file') 65 | parser.add_argument('--num_cpus', type=int, default=4, help='Number of CPU cores to use') 66 | 67 | args = parser.parse_args() 68 | autograde_df(args.df_path, args.num_cpus) 69 | -------------------------------------------------------------------------------- /evals/eval_gpt_web.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from openai import OpenAI 3 | import time 4 | from typing import List, Dict, Any 5 | import json 6 | import pandas as pd 7 | from pathlib import Path 8 | import argparse 9 | from dotenv import load_dotenv 10 | import os 11 | from tqdm import tqdm 12 | import multiprocessing as mp 13 | from queue import Empty 14 | from concurrent.futures import ProcessPoolExecutor 15 | 16 | load_dotenv() 17 | 18 | class WebSearchEvaluator: 19 | def __init__(self, model: str, output_path: Path, num_workers: int = 4, trial: int = 0): 20 | self.model = model 21 | self.output_path = output_path 22 | self.num_workers = num_workers 23 | self.trial = trial 24 | 25 | # Load existing results if any 26 | self.processed_questions = set() 27 | if self.output_path.exists(): 28 | with open(self.output_path, 'r') as f: 29 | for line in f: 30 | try: 31 | result = json.loads(line) 32 | self.processed_questions.add(result['question']) 33 | except: 34 | continue 35 | 36 | def worker_init(self): 37 | """Initialize OpenAI client for each worker.""" 38 | # Create new client for each process 39 | self.client = OpenAI( 40 | api_key=os.environ.get("OPENAI_API_KEY"), 41 | base_url=os.environ.get("OPENAI_BASE_URL") 42 | ) 43 | 44 | def evaluate_single(self, row: pd.Series) -> Dict[str, Any]: 45 | """Evaluate a single question with its true answer.""" 46 | # Skip if already processed 47 | if row['question'] in self.processed_questions: 48 | return None 49 | 50 | if not hasattr(self, 'client'): 51 | self.worker_init() 52 | 53 | try: 54 | start_time = time.time() 55 | response = self.client.responses.create( 56 | model=self.model, 57 | tools=[{"type": "web_search_preview"}], 58 | input=row['question'] 59 | ) 60 | end_time = time.time() 61 | result = { 62 | "question": row['question'], 63 | "true_answer": row['true_answer'], 64 | "answer": response.output_text, 65 | "model": self.model, 66 | "time_taken": end_time - start_time, 67 | "timestamp": time.strftime("%Y-%m-%d %H:%M:%S") 68 | } 69 | return result 70 | except Exception as e: 71 | return { 72 | "question": row['question'], 73 | "true_answer": row['true_answer'], 74 | "answer": None, 75 | "error": str(e), 76 | "model": self.model, 77 | "timestamp": time.strftime("%Y-%m-%d %H:%M:%S") 78 | } 79 | 80 | def save_result(self, result: Dict[str, Any]) -> None: 81 | """Save a single result to the JSONL file.""" 82 | with open(self.output_path, 'a') as f: 83 | f.write(json.dumps(result) + '\n') 84 | 85 | def evaluate_batch(self, df: pd.DataFrame) -> None: 86 | """Evaluate questions in parallel using multiple workers.""" 87 | with ProcessPoolExecutor( 88 | max_workers=self.num_workers, 89 | initializer=self.worker_init 90 | ) as executor: 91 | # Convert DataFrame rows to list of Series 92 | rows = [row for _, row in df.iterrows()] 93 | 94 | # Create progress bar for total rows 95 | with tqdm(total=len(rows), desc="Processing questions") as pbar: 96 | # Submit all tasks 97 | futures = [executor.submit(self.evaluate_single, row) for row in rows] 98 | 
99 | # Process results as they complete 100 | for future in futures: 101 | result = future.result() 102 | if result is not None: # Only save if not already processed 103 | self.save_result(result) 104 | pbar.update(1) 105 | 106 | def parse_args(): 107 | parser = argparse.ArgumentParser(description='Evaluate questions using GPT-4 with web search') 108 | parser.add_argument('--output_dir', type=str, default='output', 109 | help='Directory to save results (default: output)') 110 | parser.add_argument('--input_data', type=str, 111 | default='./evals/datasets/frames_test_set.csv', 112 | help='Path to input CSV file') 113 | parser.add_argument('--model', type=str, 114 | default=os.getenv("LITELLM_EVAL_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "gpt-4o-mini")), 115 | help='Model to use for evaluation') 116 | parser.add_argument('--num_workers', type=int, default=4, 117 | help='Number of parallel workers (default: 4)') 118 | parser.add_argument('--trial', type=int, default=0, 119 | help='Trial number for this evaluation run (default: 0)') 120 | return parser.parse_args() 121 | 122 | def main(): 123 | args = parse_args() 124 | 125 | # Create output directory if it doesn't exist 126 | output_dir = Path(args.output_dir) 127 | output_dir.mkdir(parents=True, exist_ok=True) 128 | 129 | # Set up output path (now without timestamp) 130 | output_path = output_dir / f"evaluation_results_{args.model}_trial{args.trial}.jsonl" 131 | 132 | # Load input data 133 | print(f"Loading data from {args.input_data}") 134 | df = pd.read_csv(args.input_data) 135 | print(f"Loaded {len(df)} examples") 136 | 137 | # Initialize evaluator 138 | evaluator = WebSearchEvaluator( 139 | model=args.model, 140 | output_path=output_path, 141 | num_workers=args.num_workers, 142 | trial=args.trial 143 | ) 144 | 145 | # Run evaluation 146 | print(f"Starting evaluation with model {args.model} using {args.num_workers} workers...") 147 | evaluator.evaluate_batch(df) 148 | print(f"Results saved to {output_path}") 149 | 150 | # Load and display summary 151 | results_df = pd.read_json(output_path, lines=True) 152 | print("\nResults summary:") 153 | print(f"Model: {args.model}") 154 | print(f"Total evaluations: {len(results_df)}") 155 | print(f"Successful evaluations: {len(results_df[~results_df['answer'].isna()])}") 156 | print(f"Failed evaluations: {len(results_df[results_df['answer'].isna()])}") 157 | 158 | if __name__ == "__main__": 159 | main() 160 | -------------------------------------------------------------------------------- /evals/eval_tasks.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import threading 6 | import time 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | from pathlib import Path 9 | 10 | import datasets 11 | import pandas as pd 12 | from datasets import Dataset 13 | from dotenv import load_dotenv 14 | from tqdm import tqdm 15 | from opendeepsearch import OpenDeepSearchTool 16 | 17 | from smolagents import ( 18 | AgentError, 19 | CodeAgent, 20 | LiteLLMModel, 21 | HfApiModel, 22 | PythonInterpreterTool, 23 | ToolCallingAgent, 24 | ) 25 | from smolagents.agents import ActionStep 26 | 27 | 28 | load_dotenv() 29 | 30 | APPEND_ANSWER_LOCK = threading.Lock() 31 | 32 | 33 | def parse_arguments(): 34 | parser = argparse.ArgumentParser(description="Runs an agent powered by the given model on smolagent benchmark.") 35 | parser.add_argument( 36 | "--date", 37 | type=str, 38 | default=None, 39 | 
help="The date for the evaluation.", 40 | ) 41 | parser.add_argument( 42 | "--eval-tasks", 43 | type=str, 44 | nargs="+", 45 | default=["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"], 46 | help="List of evaluation task paths", 47 | ) 48 | parser.add_argument( 49 | "--search-model-id", 50 | type=str, 51 | default="fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct", 52 | help="The model ID to use for the search tool (defaults to same as model-id)", 53 | ) 54 | parser.add_argument( 55 | "--model-type", 56 | type=str, 57 | default="LiteLLMModel", 58 | choices=["LiteLLMModel", "HfApiModel"], 59 | help="The model type to use (LiteLLMModel or HfApiModel)", 60 | ) 61 | parser.add_argument( 62 | "--model-id", 63 | type=str, 64 | default="fireworks_ai/accounts/fireworks/models/qwq-32b", 65 | help="The model ID to use for the specified model type", 66 | ) 67 | parser.add_argument( 68 | "--agent-action-type", 69 | type=str, 70 | default="codeact", 71 | choices=["codeact", "tool-calling", "vanilla"], 72 | help="The agent action type: 'codeact', 'tool-calling', or 'vanilla' to use the vanilla llm", 73 | ) 74 | parser.add_argument( 75 | "--parallel-workers", 76 | type=int, 77 | default=8, 78 | help="The number of processes to run in parallel", 79 | ) 80 | parser.add_argument( 81 | "--num-trials", 82 | type=int, 83 | default=1, 84 | help="Number of trials to run for each evaluation", 85 | ) 86 | return parser.parse_args() 87 | 88 | 89 | def load_eval_dataset(eval_tasks: list): 90 | eval_ds = {} 91 | for task_path in eval_tasks: 92 | task_name = task_path.split("/")[-1][:-4] 93 | df = pd.read_csv(task_path) 94 | dataset = Dataset.from_pandas(df) 95 | eval_ds[task_name] = dataset 96 | return eval_ds 97 | 98 | 99 | def serialize_agent_error(obj): 100 | if isinstance(obj, AgentError): 101 | return {"error_type": obj.__class__.__name__, "message": obj.message} 102 | else: 103 | return str(obj) 104 | 105 | 106 | def append_answer(entry: dict, jsonl_file: str) -> None: 107 | jsonl_file = Path(jsonl_file) 108 | jsonl_file.parent.mkdir(parents=True, exist_ok=True) 109 | with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp: 110 | fp.write(json.dumps(entry) + "\n") 111 | assert os.path.exists(jsonl_file), "File not found!" 
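# NOTE: run_with_timeout (below) bounds how long the caller waits, but Python
# threads cannot be forcibly cancelled: on timeout, future.result() raises and
# "Timed Out" is returned, yet leaving the `with ThreadPoolExecutor(...)` block
# still calls shutdown(wait=True), which waits for the in-flight call to finish.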
112 | 
113 | 
114 | def run_with_timeout(func, timeout):
115 |     from concurrent.futures import TimeoutError as FuturesTimeoutError
116 |     with ThreadPoolExecutor(max_workers=1) as executor:
117 |         future = executor.submit(func)
118 |         try:
119 |             return future.result(timeout=timeout)
120 |         except FuturesTimeoutError:  # the builtin TimeoutError does not catch this before Python 3.11
121 |             return "Timed Out"
122 | 
123 | def answer_single_question(example, model, answers_file, action_type, search_model_id=None):
124 |     if action_type == "vanilla":
125 |         agent = model
126 |     elif action_type == "codeact":
127 |         agent = CodeAgent(
128 |             tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id)],
129 |             model=model,
130 |             additional_authorized_imports=["numpy"],
131 |             max_steps=15,
132 |         )
133 |     elif action_type == "tool-calling":
134 |         agent = ToolCallingAgent(
135 |             tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id), PythonInterpreterTool()],
136 |             model=model,
137 |             additional_authorized_imports=["numpy"],
138 |             max_steps=15,
139 |         )
140 | 
141 |     augmented_question = example["question"]
142 |     start_time = time.time()
143 |     TIMEOUT_SECONDS = 300  # 5 minutes timeout
144 | 
145 |     try:
146 |         if action_type == "vanilla":
147 |             def get_vanilla_response():
148 |                 response = agent([{"role": "user", "content": augmented_question}])
149 |                 return response.content, agent.last_output_token_count
150 | 
151 |             answer, token_count = run_with_timeout(get_vanilla_response, TIMEOUT_SECONDS)
152 |             intermediate_steps = answer
153 |         else:
154 |             def get_agent_response():
155 |                 response = str(agent.run(augmented_question))
156 |                 token_count = agent.monitor.get_total_token_counts()
157 |                 # Remove memory from logs to make them more compact.
158 |                 for step in agent.memory.steps:
159 |                     if isinstance(step, ActionStep):
160 |                         step.agent_memory = None
161 |                 return response, token_count, str(agent.memory.steps)
162 | 
163 |             answer, token_count, intermediate_steps = run_with_timeout(get_agent_response, TIMEOUT_SECONDS)
164 | 
165 |         end_time = time.time()
166 |     except Exception as e:
167 |         print("Error on ", augmented_question, e)
168 |         answer, token_count, intermediate_steps = None, None, []  # defaults so the record below can still be written
169 |         end_time = time.time()  # keep the timestamp numeric, matching start_time
170 |     annotated_example = {
171 |         "model_id": model.model_id,
172 |         "agent_action_type": action_type,
173 |         "original_question": example["question"],
174 |         "answer": answer,
175 |         "true_answer": example["true_answer"],
176 |         "intermediate_steps": intermediate_steps,
177 |         "start_time": start_time,
178 |         "end_time": end_time,
179 |         "token_counts": token_count,
180 |     }
181 |     append_answer(annotated_example, answers_file)
182 | 
183 | 
184 | def answer_questions(
185 |     eval_ds,
186 |     model,
187 |     date,
188 |     action_type: str = "codeact",
189 |     output_dir: str = "output",
190 |     parallel_workers: int = 32,
191 |     search_model_id: str = None,
192 |     num_trials: int = 1,
193 | ):
194 |     date = date or datetime.date.today().isoformat()
195 |     model_id = model.model_id
196 | 
197 |     # Create directory structure: output/model_id/action_type/task
198 |     model_dir = model_id.replace('/', '__')
199 | 
200 |     for task in eval_ds:
201 |         task_dir = os.path.join(output_dir, model_dir, action_type, task)
202 |         os.makedirs(task_dir, exist_ok=True)
203 | 
204 |         for trial in range(num_trials):
205 |             file_name = f"{task_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__trial{trial}.jsonl"
206 |             print(f"Starting processing trial {trial + 1}/{num_trials} and writing output to '{file_name}'")
207 |             answered_questions = []
208 |             if os.path.exists(file_name):
209 |                 with open(file_name, "r") as f:
210 |                     for line in f:
211 | 
answered_questions.append(json.loads(line)["original_question"]) 212 | examples_todo = [example for example in eval_ds[task] if example["question"] not in answered_questions] 213 | print(f"Launching {parallel_workers} parallel workers.") 214 | 215 | with ThreadPoolExecutor(max_workers=parallel_workers) as exe: 216 | futures = [ 217 | exe.submit(answer_single_question, example, model, file_name, action_type, search_model_id) 218 | for example in examples_todo 219 | ] 220 | for f in tqdm(as_completed(futures), total=len(examples_todo), desc="Processing tasks"): 221 | f.result() 222 | 223 | print("All tasks processed.") 224 | 225 | 226 | if __name__ == "__main__": 227 | args = parse_arguments() 228 | 229 | eval_ds = load_eval_dataset(args.eval_tasks) 230 | 231 | if args.model_type == "LiteLLMModel": 232 | model = LiteLLMModel( 233 | args.model_id, 234 | max_completion_tokens=8192, 235 | temperature=0.2, 236 | # api_key=os.getenv("OPENROUTER_API_KEY"), 237 | ) 238 | else: 239 | model = HfApiModel(args.model_id, provider="together", max_tokens=8192) 240 | 241 | answer_questions( 242 | eval_ds, 243 | model, 244 | args.date, 245 | action_type=args.agent_action_type, 246 | parallel_workers=args.parallel_workers, 247 | search_model_id=args.search_model_id, 248 | num_trials=args.num_trials, 249 | ) -------------------------------------------------------------------------------- /evals/gpt_web_extract.py: -------------------------------------------------------------------------------- 1 | import litellm 2 | from multiprocessing import Pool 3 | import pandas as pd 4 | from tqdm import tqdm 5 | import argparse 6 | 7 | input_prompt = """You are a precise answer extractor. Your job is to read a question and a detailed answer, then output ONLY the final answer without any explanation. 8 | 9 | For example: 10 | Question: "What is 2+2?" 11 | Detailed Answer: "Let me calculate this. 2 plus 2 equals 4, which is a basic mathematical fact." 12 | Final Answer: 4 13 | 14 | Question: "What color is the sky on a clear day?" 15 | Detailed Answer: "When we look up on a clear day, the sky appears blue due to a phenomenon called Rayleigh scattering." 16 | Final Answer: blue 17 | 18 | Question: "If my future wife has the same first name as the 15th first lady of the United States' mother and her surname is the same as the second assassinated president's mother's maiden name, what is my future wife's name?" 19 | Detailed Answer: "The 15th First Lady of the United States was Ellen Wilson, and her mother's name was Hannah. The second assassinated president was Abraham Lincoln, and his mother's maiden name was Hodge. \n\nPutting that together, your future wife's name is **Hannah Hodge**." 
20 | Final Answer: Hannah Hodge
21 | 
22 | Now do this:
23 | Question: {question}
24 | Detailed Answer: {detailed_answer}
25 | Final Answer:"""
26 | 
27 | def process_row(row):
28 |     """Process a single row using litellm."""
29 |     try:
30 |         output = litellm.completion(
31 |             model="openrouter/google/gemini-2.0-flash-001",
32 |             messages=[{
33 |                 "role": "user",
34 |                 "content": input_prompt.format(
35 |                     question=row['question'],
36 |                     detailed_answer=row['original_answer']
37 |                 )
38 |             }],
39 |             temperature=0.3
40 |         )
41 |         return output['choices'][0]['message']['content']
42 |     except Exception as e:
43 |         print(f"Error processing row: {e}")
44 |         return None
45 | 
46 | def process_dataframe(df, num_workers=4):
47 |     """Process the entire dataframe using a pool of workers."""
48 |     with Pool(num_workers) as pool:
49 |         # Use tqdm to show progress bar
50 |         results = list(tqdm(
51 |             pool.imap(process_row, [row for _, row in df.iterrows()]),
52 |             total=len(df)
53 |         ))
54 | 
55 |     # Add results as a new column
56 |     df['processed_output'] = results
57 |     return df
58 | 
59 | if __name__ == '__main__':
60 |     parser = argparse.ArgumentParser(description='Extract final answers from a JSONL results file using litellm in parallel')
61 |     parser.add_argument('input_file', type=str, help='Path to the input JSONL file')
62 |     parser.add_argument('--workers', type=int, default=4, help='Number of worker processes (default: 4)')
63 | 
64 |     args = parser.parse_args()
65 | 
66 |     # Load and process the dataframe
67 |     df = pd.read_json(args.input_file, lines=True)
68 | 
69 |     # Rename 'answer' to 'original_answer'
70 |     df = df.rename(columns={'answer': 'original_answer'})
71 | 
72 |     # Process the dataframe and store results in 'answer' column
73 |     processed_df = process_dataframe(df, num_workers=args.workers)
74 |     processed_df = processed_df.rename(columns={'processed_output': 'answer'})
75 | 
76 |     # Save to output file (adding '_processed' and a .csv extension, since the frame is written with to_csv)
77 |     output_file = args.input_file.rsplit('.', 1)[0] + '_processed.csv'
78 |     processed_df.to_csv(output_file, index=False)
79 |     print(f"Processed data saved to: {output_file}")
80 | 
--------------------------------------------------------------------------------
/evals/grader_prompts.py:
--------------------------------------------------------------------------------
1 | GRADER_TEMPLATE = """
2 | Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
3 | First, I will give examples of each grade, and then you will grade a new example.
4 | 
5 | 
6 | The following are examples of CORRECT predicted answers.
7 | ```
8 | Question: What are the names of Barack Obama's children?
9 | Gold target: Malia Obama and Sasha Obama
10 | Predicted answer 1: sasha and malia obama
11 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
12 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
13 | ```
14 | These predicted answers are all CORRECT because:
15 |     - They fully contain the important information in the gold target.
16 |     - They do not contain any information that contradicts the gold target.
17 |     - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
18 | - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. 19 | 20 | 21 | The following are examples of INCORRECT predicted answers. 22 | ``` 23 | Question: What are the names of Barack Obama's children? 24 | Gold target: Malia and Sasha 25 | Predicted answer 1: Malia. 26 | Predicted answer 2: Malia, Sasha, and Susan. 27 | Predicted answer 3: Barack Obama does not have any children. 28 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. 29 | Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. 30 | Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? 31 | Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. 32 | ``` 33 | These predicted answers are all INCORRECT because: 34 | - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. 35 | 36 | 37 | The following are examples of NOT_ATTEMPTED predicted answers. 38 | ``` 39 | Question: What are the names of Barack Obama's children? 40 | Gold target: Malia and Sasha 41 | Predicted answer 1: I don't know. 42 | Predicted answer 2: I need more context about which Obama you are talking about. 43 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. 44 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. 45 | ``` 46 | These predicted answers are all NOT_ATTEMPTED because: 47 | - The important information in the gold target is not included in the answer. 48 | - No statements in the answer contradict the gold target. 49 | 50 | 51 | Also note the following things: 52 | - For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". 53 | - Predicted answers "120k", "124k", and 115k" are all CORRECT. 54 | - Predicted answers "100k" and "113k" are INCORRECT. 55 | - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. 56 | - The presence or absence of commas in numbers (e.g., "5,876" vs "5876") does not affect grading. 57 | - Numbers written as words or digits are equivalent (e.g., "2 million" vs "2000000" vs "2,000,000" are all considered the same). 58 | - For large numerical answers, a margin of error of ±1% is acceptable (e.g., if the gold answer is 855, predicted answers between 846.45 and 863.55 are CORRECT). 59 | - The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. 60 | - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" 
61 | - Do not punish predicted answers if they omit information that would be clearly inferred from the question.
62 |     - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
63 |     - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", where the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
64 |     - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
65 |     - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
66 | - Do not punish for typos in people's names if it's clearly the same name.
67 |     - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
68 |
69 |
70 | Here is a new example. Simply reply with either CORRECT, INCORRECT, or NOT_ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
71 | ```
72 | Question: {question}
73 | Gold target: {target}
74 | Predicted answer: {predicted_answer}
75 | ```
76 |
77 | Grade the predicted answer of this new question as one of:
78 | A: CORRECT
79 | B: INCORRECT
80 | C: NOT_ATTEMPTED
81 |
82 | Just return one of the letters "A", "B", or "C", with no text around it.
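The ±1% tolerance rule above is the one place where the rubric implies arithmetic; a minimal sketch of the band it describes (illustrative only — the helper name is not part of the repo, and the grader applies the rule through the LLM rather than through code):

    def within_one_percent(gold: float, predicted: float) -> bool:
        # The acceptable band is gold ± 1% of gold, e.g. 855 -> [846.45, 863.55]
        return abs(predicted - gold) <= 0.01 * gold

    assert within_one_percent(855, 850)        # inside the band -> CORRECT
    assert not within_one_percent(855, 830)    # outside the band -> INCORRECT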
83 | """.strip()
84 |
--------------------------------------------------------------------------------
/gradio_demo.py:
--------------------------------------------------------------------------------
 1 | from smolagents import CodeAgent, GradioUI, LiteLLMModel
 2 | from opendeepsearch import OpenDeepSearchTool
 3 | import os
 4 | from dotenv import load_dotenv
 5 | import argparse
 6 |
 7 | # Load environment variables
 8 | load_dotenv()
 9 |
10 | # Add command line argument parsing
11 | parser = argparse.ArgumentParser(description='Run the Gradio demo with custom models')
12 | parser.add_argument('--model-name',
13 |                     default=os.getenv("LITELLM_SEARCH_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
14 |                     help='Model name for search')
15 | parser.add_argument('--orchestrator-model',
16 |                     default=os.getenv("LITELLM_ORCHESTRATOR_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
17 |                     help='Model name for orchestration')
18 | parser.add_argument('--reranker',
19 |                     choices=['jina', 'infinity'],
20 |                     default='jina',
21 |                     help='Reranker to use (jina or infinity)')
22 | parser.add_argument('--search-provider',
23 |                     choices=['serper', 'searxng'],
24 |                     default='serper',
25 |                     help='Search provider to use (serper or searxng)')
26 | parser.add_argument('--searxng-instance',
27 |                     help='SearXNG instance URL (required if search-provider is searxng)')
28 | parser.add_argument('--searxng-api-key',
29 |                     help='SearXNG API key (optional)')
30 | parser.add_argument('--serper-api-key',
31 |                     help='Serper API key (optional, will use SERPER_API_KEY env var if not provided)')
32 | parser.add_argument('--openai-base-url',
33 |                     help='OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided)')
34 | parser.add_argument('--server-port',
35 |                     type=int,
36 |                     default=7860,
37 |                     help='Port to run the Gradio server on')
38 |
39 | args = parser.parse_args()
40 |
41 | # Validate arguments
42 | if args.search_provider == 'searxng' and not (args.searxng_instance or os.getenv('SEARXNG_INSTANCE_URL')):
43 |     parser.error("--searxng-instance is required when using --search-provider=searxng")
44 |
45 | # Set OpenAI base URL if provided via command line
46 | if args.openai_base_url:
47 |     os.environ["OPENAI_BASE_URL"] = args.openai_base_url
48 |
49 | # Use the command line arguments
50 | search_tool = OpenDeepSearchTool(
51 |     model_name=args.model_name,
52 |     reranker=args.reranker,
53 |     search_provider=args.search_provider,
54 |     serper_api_key=args.serper_api_key,
55 |     searxng_instance_url=args.searxng_instance,
56 |     searxng_api_key=args.searxng_api_key
57 | )
58 | model = LiteLLMModel(
59 |     model_id=args.orchestrator_model,
60 |     temperature=0.2,
61 | )
62 |
63 | # Initialize the agent with the search tool
64 | agent = CodeAgent(tools=[search_tool], model=model)
65 |
66 | # Launch the Gradio UI on the configured port
67 | GradioUI(agent).launch(server_name="127.0.0.1", server_port=args.server_port, share=False)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "OpenDeepSearch"
 3 | version = "0.1.0"
 4 | description = "OpenDeepSearch: an open-source web search agent framework"
 5 | authors = [
 6 |     {name = "Salaheddin Alzu'bi", email = "salaheddinalzubi@gmail.com"},
 7 | ]
 8 |
 9 | dependencies = ["openai>=1.66.2", "datasets>=3.3.2", "transformers>=4.49.0", "litellm>=1.61.20", "langchain>=0.3.19", "crawl4ai @ git+https://github.com/salzubi401/crawl4ai.git@main", "fasttext-wheel>=0.9.2", "wikipedia-api>=0.8.1", "pillow>=10.4.0", "smolagents>=1.9.2", "gradio==5.20.1"]
10 | requires-python = ">=3.10"
11 | readme = "README.md"
12 | license = {text = "Apache-2.0"}
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 |
19 | [tool.pdm]
20 | distribution = true
21 |
22 | [tool.hatch.metadata]
23 | allow-direct-references = true
24 |
25 | [tool.uv]
26 | python = "3.10"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | openai>=1.65.1
 2 | datasets>=3.3.2
 3 | transformers>=4.49.0
 4 | litellm>=1.61.20
 5 | langchain>=0.3.19
 6 | git+https://github.com/salzubi401/crawl4ai.git@main
 7 | fasttext-wheel>=0.9.2
 8 | wikipedia-api>=0.8.1
 9 | pillow>=10.4.0
10 | smolagents>=1.9.2
11 | gradio==5.20.1
12 |
--------------------------------------------------------------------------------
/src/opendeepsearch/__init__.py:
--------------------------------------------------------------------------------
1 | from .ods_agent import OpenDeepSearchAgent
2 | from .ods_tool import OpenDeepSearchTool
3 |
4 | __all__ = ['OpenDeepSearchAgent', 'OpenDeepSearchTool']
--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/build_context.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict, Optional
 2 | from loguru import logger
 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 4 |
 5 |
 6 | def extract_information(organic_results: List[Dict]) -> List[str]:
 7 |     """Extract snippets from organic search results as formatted strings."""
 8 |     formatted_results = []
 9 |     for item in organic_results:
10 |         if 'snippet' in item:
11 |             result_parts = [
12 |                 f"title: {item.get('title', 'N/A')}",
13 |                 f"date authored: {item.get('date', 'N/A')}",
14 |                 f"link: {item.get('link', 'N/A')}",
15 |                 f"snippet: {item['snippet']}"
16 |             ]
17 |
18 |             if 'html' in item:
19 |                 result_parts.append(f"additional information: {item['html']}")
20 |
21 |             formatted_results.append('\n'.join(result_parts))
22 |
23 |     return formatted_results
24 |
25 | def extract_top_stories(top_stories: Optional[List[Dict]]) -> List[str]:
26 |     """Extract titles from top stories."""
27 |     if not top_stories:
28 |         return []
29 |
30 |     return [
31 |         item['title']
32 |         for item in top_stories
33 |         if 'title' in item
34 |     ]
35 |
36 | def extract_answer_box(
37 |     answer_box: Optional[Dict]
38 | ) -> List[str]:
39 |     """Extract information from answer box."""
40 |     results = []
41 |
42 |     if answer_box:
43 |         for key in ['answer', 'snippet']:
44 |             if answer_box.get(key):
45 |                 results.append(answer_box[key])
46 |
47 |     return results
48 |
49 | def build_context(
50 |     sources_result: Dict,
51 | ) -> str:
52 |     """
53 |     Build context from search results.
54 |
55 |     Args:
56 |         sources_result: Dictionary containing search results
57 |
58 |     Returns:
59 |         A formatted string containing all relevant search results
60 |     """
61 |     try:
62 |         # Build context from different components
63 |         organic_results = extract_information(sources_result.get('organic', []))
64 |         top_stories = extract_top_stories(sources_result.get('topStories'))
65 |         answer_box = extract_answer_box(
66 |             sources_result.get('answerBox')
67 |         )
68 |
69 |         # Combine all results into a single string
70 |         context_parts = []
71 |
72 |         # Add answer box if available
73 |         if answer_box:
74 |             context_parts.append("ANSWER BOX:")
75 |             context_parts.extend(answer_box)
76 |             context_parts.append("")  # Empty line for separation
77 |
78 |         # Add organic results
79 |         if organic_results:
80 |             context_parts.append("SEARCH RESULTS:")
81 |             context_parts.extend(organic_results)
82 |             context_parts.append("")  # Empty line for separation
83 |
84 |         # Add top stories if available
85 |         if top_stories:
86 |             context_parts.append("TOP STORIES:")
87 |             context_parts.extend(top_stories)
88 |
89 |         # Join all parts with newlines
90 |         return "\n".join(context_parts)
91 |
92 |     except Exception as e:
93 |         logger.exception(f"An error occurred while building context: {e}")
94 |         return ""  # Return empty string in case of error
--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/process_sources_pro.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Any, Dict, List, Optional, Tuple
 3 | from opendeepsearch.context_scraping.crawl4ai_scraper import WebScraper
 4 | from opendeepsearch.ranking_models.infinity_rerank import InfinitySemanticSearcher
 5 | from opendeepsearch.ranking_models.jina_reranker import JinaReranker
 6 | from opendeepsearch.ranking_models.chunker import Chunker
 7 |
 8 | @dataclass
 9 | class Source:
10 |     link: str
11 |     html: str = ""
12 |     # Add other relevant fields here
13 |
14 | class SourceProcessor:
15 |     def __init__(
16 |         self,
17 |         top_results: int = 5,
18 |         strategies: List[str] = ["no_extraction"],
19 |         filter_content: bool = True,
20 |         reranker: str = "infinity"
21 |     ):
22 |         self.strategies = strategies
23 |         self.filter_content = filter_content
24 |         self.scraper = WebScraper(
25 |             strategies=self.strategies,
26 |             filter_content=self.filter_content
27 |         )
28 |         self.top_results = top_results
29 |         self.chunker = Chunker()
30 |
31 |         # Initialize the appropriate reranker
32 |         if reranker.lower() == "jina":
33 |             self.semantic_searcher = JinaReranker()
34 |             print("Using Jina Reranker")
35 |         else:  # default to infinity
36 |             self.semantic_searcher = InfinitySemanticSearcher()
37 |             print("Using Infinity Reranker")
38 |
39 |     async def process_sources(
40 |         self,
41 |         sources: Any,  # search-API response object that exposes the results dict via `.data`
42 |         num_elements: int,
43 |         query: str,
44 |         pro_mode: bool = False
45 |     ) -> Dict:
46 |         try:
47 |             valid_sources = self._get_valid_sources(sources, num_elements)
48 |             if not valid_sources:
49 |                 return sources.data  # nothing to process; return the raw search payload
50 |
51 |             if not pro_mode:
52 |                 # Check if there's a Wikipedia article among valid sources
53 |                 wiki_sources = [(i, source) for i, source in valid_sources
54 |                                 if 'wikipedia.org' in source['link']]
55 |                 if not wiki_sources:
56 |                     return sources.data  # no Wikipedia source; keep the plain snippets
57 |                 # If a Wikipedia article exists, only process that
58 |                 valid_sources = wiki_sources[:1]  # Take only the first Wikipedia source
59 |
60 |             html_contents = await self._fetch_html_contents([s[1]['link'] for s in valid_sources])
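            # Each entry in html_contents is the scraped text for the matching
            # valid source; the next call chunks that text, reranks the chunks
            # against the query, and stores the top-ranked chunks on each
            # source dict under its 'html' key.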
61 |             return self._update_sources_with_content(sources.data, valid_sources, html_contents, query)
62 |         except Exception as e:
63 |             print(f"Error in process_sources: {e}")
64 |             return getattr(sources, 'data', sources)  # best-effort fallback to the raw payload
65 |
66 |     def _get_valid_sources(self, sources: Any, num_elements: int) -> List[Tuple[int, dict]]:
67 |         return [(i, source) for i, source in enumerate(sources.data['organic'][:num_elements]) if source]
68 |
69 |     async def _fetch_html_contents(self, links: List[str]) -> List[str]:
70 |         raw_contents = await self.scraper.scrape_many(links)
71 |         return [x['no_extraction'].content for x in raw_contents.values()]
72 |
73 |     def _process_html_content(self, html: str, query: str) -> str:
74 |         if not html:
75 |             return ""
76 |         try:
77 |             # Split the HTML content into chunks
78 |             documents = self.chunker.split_text(html)
79 |
80 |             # Rerank the chunks based on the query
81 |             reranked_content = self.semantic_searcher.get_reranked_documents(
82 |                 query,
83 |                 documents,
84 |                 top_k=self.top_results
85 |             )
86 |
87 |             return reranked_content
88 |
89 |         except Exception as e:
90 |             print(f"Error in content processing: {e}")
91 |             return ""
92 |
93 |     def _update_sources_with_content(
94 |         self,
95 |         sources: Dict,
96 |         valid_sources: List[Tuple[int, dict]],
97 |         html_contents: List[str],
98 |         query: str
99 |     ) -> Dict:
100 |         for (i, source), html in zip(valid_sources, html_contents):
101 |             source['html'] = self._process_html_content(html, query)
102 |         # the source dicts are mutated in place, so `sources` already holds the update
103 |         return sources
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/basic_web_scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the BasicWebScraper class for basic web scraping functionality.
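
Example (illustrative; any Crawl4AI extraction strategy can be passed in):

    from crawl4ai.extraction_strategy import NoExtractionStrategy

    scraper = BasicWebScraper()
    config = ExtractionConfig(name="no_extraction", strategy=NoExtractionStrategy())
    result = await scraper.extract(config, "https://example.com")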
3 | """ 4 | 5 | from dataclasses import dataclass 6 | from typing import Optional 7 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 8 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator 9 | from crawl4ai.content_filter_strategy import PruningContentFilter 10 | 11 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult 12 | from crawl4ai.extraction_strategy import ExtractionStrategy 13 | 14 | @dataclass 15 | class ExtractionConfig: 16 | """Configuration for extraction strategies""" 17 | name: str 18 | strategy: ExtractionStrategy 19 | 20 | class BasicWebScraper: 21 | """Basic web scraper implementation""" 22 | def __init__(self, browser_config: Optional[BrowserConfig] = None): 23 | self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True) 24 | 25 | def _create_crawler_config(self) -> CrawlerRunConfig: 26 | """Creates default crawler configuration""" 27 | return CrawlerRunConfig( 28 | cache_mode=CacheMode.BYPASS, 29 | markdown_generator=DefaultMarkdownGenerator( 30 | content_filter=PruningContentFilter() 31 | ) 32 | ) 33 | 34 | async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult: 35 | """Performs extraction using specified strategy""" 36 | try: 37 | config = self._create_crawler_config() 38 | config.extraction_strategy = extraction_config.strategy 39 | 40 | async with AsyncWebCrawler(config=self.browser_config) as crawler: 41 | result = await crawler.arun(url=url, config=config) 42 | 43 | extraction_result = ExtractionResult( 44 | name=extraction_config.name, 45 | success=result.success, 46 | content=result.extracted_content 47 | ) 48 | 49 | if result.success: 50 | extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown) 51 | extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations) 52 | 53 | return extraction_result 54 | 55 | except Exception as e: 56 | return ExtractionResult( 57 | name=extraction_config.name, 58 | success=False, 59 | error=str(e) 60 | ) -------------------------------------------------------------------------------- /src/opendeepsearch/context_scraping/crawl4ai_scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modular web scraping implementation using Crawl4AI. 3 | Supports multiple extraction strategies including LLM, CSS, and XPath. 
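
Example (illustrative; run inside an event loop):

    scraper = WebScraper(strategies=['no_extraction'])
    results = await scraper.scrape("https://example.com")
    print(results['no_extraction'].content)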
 4 | """
 5 |
 6 | import asyncio
 7 | import os
 8 | from dataclasses import dataclass
 9 | from typing import Dict, List, Optional
10 |
11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
12 | from crawl4ai.content_filter_strategy import PruningContentFilter
13 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
14 |
15 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult, print_extraction_result
16 | from opendeepsearch.context_scraping.basic_web_scraper import ExtractionConfig
17 | from opendeepsearch.context_scraping.strategy_factory import StrategyFactory
18 |
19 | class WebScraper:
20 |     """Unified scraper that encapsulates all extraction strategies and configuration"""
21 |     def __init__(
22 |         self,
23 |         browser_config: Optional[BrowserConfig] = None,
24 |         strategies: List[str] = ['no_extraction'],
25 |         llm_instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content",
26 |         user_query: Optional[str] = None,
27 |         debug: bool = False,
28 |         filter_content: bool = False
29 |     ):
30 |         self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True)
31 |         self.debug = debug
32 |         self.factory = StrategyFactory()
33 |         self.strategies = strategies or ['markdown_llm', 'html_llm', 'fit_markdown_llm', 'css', 'xpath', 'no_extraction', 'cosine']
34 |         self.llm_instruction = llm_instruction
35 |         self.user_query = user_query
36 |         self.filter_content = filter_content
37 |
38 |         # Validate strategies
39 |         valid_strategies = {'markdown_llm', 'html_llm', 'fit_markdown_llm', 'css', 'xpath', 'no_extraction', 'cosine'}
40 |         invalid_strategies = set(self.strategies) - valid_strategies
41 |         if invalid_strategies:
42 |             raise ValueError(f"Invalid strategies: {invalid_strategies}")
43 |
44 |         # Initialize strategy map
45 |         self.strategy_map = {
46 |             'markdown_llm': lambda: self.factory.create_llm_strategy('markdown', self.llm_instruction),
47 |             'html_llm': lambda: self.factory.create_llm_strategy('html', self.llm_instruction),
48 |             'fit_markdown_llm': lambda: self.factory.create_llm_strategy('fit_markdown', self.llm_instruction),
49 |             'css': self.factory.create_css_strategy,
50 |             'xpath': self.factory.create_xpath_strategy,
51 |             'no_extraction': self.factory.create_no_extraction_strategy,
52 |             'cosine': lambda: self.factory.create_cosine_strategy(debug=self.debug)
53 |         }
54 |
55 |     def _create_crawler_config(self) -> CrawlerRunConfig:
56 |         """Creates default crawler configuration"""
57 |         content_filter = PruningContentFilter(user_query=self.user_query) if self.user_query else PruningContentFilter()
58 |         return CrawlerRunConfig(
59 |             cache_mode=CacheMode.BYPASS,
60 |             markdown_generator=DefaultMarkdownGenerator(
61 |                 content_filter=content_filter
62 |             )
63 |         )
64 |
65 |     async def scrape(self, url: str) -> Dict[str, ExtractionResult]:
66 |         """
67 |         Scrape URL using configured strategies
68 |
69 |         Args:
70 |             url: Target URL to scrape
71 |         """
72 |         # Handle Wikipedia URLs
73 |         if 'wikipedia.org/wiki/' in url:
74 |             from opendeepsearch.context_scraping.utils import get_wikipedia_content
75 |             try:
76 |                 content = get_wikipedia_content(url)
77 |                 # Create same result for all strategies since we're using Wikipedia content
78 |                 return {
79 |                     strategy_name: ExtractionResult(
80 |                         name=strategy_name,
81 |                         success=True,
82 |                         content=content
83 |                     ) for strategy_name in self.strategies
84 |                 }
85 |             except Exception as e:
 86 |                 if self.debug:
 87 |                     print(f"Debug: Wikipedia extraction failed: {str(e)}")
 88 |                 # If Wikipedia extraction fails, fall through to normal scraping
 89 |
 90 |         # Normal scraping for non-Wikipedia URLs or if Wikipedia extraction failed
 91 |         results = {}
 92 |         for strategy_name in self.strategies:
 93 |             config = ExtractionConfig(
 94 |                 name=strategy_name,
 95 |                 strategy=self.strategy_map[strategy_name]()
 96 |             )
 97 |             result = await self.extract(config, url)
 98 |             results[strategy_name] = result
 99 |
100 |         return results
101 |
102 |     async def scrape_many(self, urls: List[str]) -> Dict[str, Dict[str, ExtractionResult]]:
103 |         """
104 |         Scrape multiple URLs using configured strategies in parallel
105 |
106 |         Args:
107 |             urls: List of target URLs to scrape
108 |
109 |         Returns:
110 |             Dictionary mapping URLs to their extraction results
111 |         """
112 |         # Create tasks for all URLs
113 |         tasks = [self.scrape(url) for url in urls]
114 |         # Run all tasks concurrently
115 |         results_list = await asyncio.gather(*tasks)
116 |
117 |         # Build results dictionary
118 |         results = {}
119 |         for url, result in zip(urls, results_list):
120 |             results[url] = result
121 |
122 |         return results
123 |
124 |     async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult:
125 |         """Internal method to perform extraction using specified strategy"""
126 |         try:
127 |             config = self._create_crawler_config()
128 |             config.extraction_strategy = extraction_config.strategy
129 |
130 |             if self.debug:
131 |                 print(f"\nDebug: Attempting extraction with strategy: {extraction_config.name}")
132 |                 print(f"Debug: URL: {url}")
133 |                 print(f"Debug: Strategy config: {config.extraction_strategy}")
134 |                 if self.user_query:
135 |                     print(f"Debug: User query: {self.user_query}")
136 |
137 |             async with AsyncWebCrawler(config=self.browser_config) as crawler:
138 |                 if isinstance(url, list):
139 |                     result = await crawler.arun_many(urls=url, config=config)
140 |                 else:
141 |                     result = await crawler.arun(url=url, config=config)
142 |
143 |             if self.debug:
144 |                 print(f"Debug: Raw result attributes: {dir(result)}")
145 |                 print(f"Debug: Raw result: {result.__dict__}")
146 |
147 |             # Handle different result formats based on strategy
148 |             content = None
149 |             if result.success:
150 |                 if extraction_config.name in ['no_extraction', 'cosine']:
151 |                     # For strategies that return a list of dictionaries
152 |                     if hasattr(result, 'markdown_v2'):
153 |                         content = result.markdown_v2.raw_markdown
154 |                     elif hasattr(result, 'raw_html'):
155 |                         content = result.raw_html
156 |                     elif hasattr(result, 'extracted_content') and result.extracted_content:
157 |                         if isinstance(result.extracted_content, list):
158 |                             content = '\n'.join(item.get('content', '') for item in result.extracted_content)
159 |                         else:
160 |                             content = result.extracted_content
161 |
162 |                     if self.filter_content and content:
163 |                         from opendeepsearch.context_scraping.utils import filter_quality_content
164 |                         content = filter_quality_content(content)
165 |                 else:
166 |                     content = result.extracted_content
167 |                     if self.filter_content and content:
168 |                         from opendeepsearch.context_scraping.utils import filter_quality_content
169 |                         content = filter_quality_content(content)
170 |
171 |             if self.debug:
172 |                 print(f"Debug: Processed content: {content[:200] if content else None}")
173 |
174 |             extraction_result = ExtractionResult(
175 |                 name=extraction_config.name,
176 |                 success=result.success,
177 |                 content=content,
178 |                 error=getattr(result, 'error', None)  # Capture error if
available 179 | ) 180 | 181 | if result.success: 182 | extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown) 183 | extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations) 184 | elif self.debug: 185 | print(f"Debug: Final extraction result: {extraction_result.__dict__}") 186 | 187 | return extraction_result 188 | 189 | except Exception as e: 190 | if self.debug: 191 | import traceback 192 | print(f"Debug: Exception occurred during extraction:") 193 | print(traceback.format_exc()) 194 | 195 | return ExtractionResult( 196 | name=extraction_config.name, 197 | success=False, 198 | error=str(e) 199 | ) 200 | 201 | async def main(): 202 | # Example usage with single URL 203 | single_url = "https://example.com/product-page" 204 | scraper = WebScraper(debug=True) 205 | results = await scraper.scrape(single_url) 206 | 207 | # Print single URL results 208 | for result in results.values(): 209 | print_extraction_result(result) 210 | 211 | # Example usage with multiple URLs 212 | urls = [ 213 | "https://example.com", 214 | "https://python.org", 215 | "https://github.com" 216 | ] 217 | 218 | multi_results = await scraper.scrape_many(urls) 219 | 220 | # Print multiple URL results 221 | for url, url_results in multi_results.items(): 222 | print(f"\nResults for {url}:") 223 | for result in url_results.values(): 224 | print_extraction_result(result) 225 | 226 | if __name__ == "__main__": 227 | asyncio.run(main()) 228 | -------------------------------------------------------------------------------- /src/opendeepsearch/context_scraping/extraction_result.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the ExtractionResult class for holding extraction operation results. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | class ExtractionResult: 8 | """Holds the results of an extraction operation""" 9 | def __init__(self, name: str, success: bool, content: Optional[str] = None, error: Optional[str] = None): 10 | self.name = name 11 | self.success = success 12 | self.content = content 13 | self.error = error 14 | self.raw_markdown_length = 0 15 | self.citations_markdown_length = 0 16 | 17 | def print_extraction_result(result: ExtractionResult): 18 | """Utility function to print extraction results""" 19 | if result.success: 20 | print(f"\n=== {result.name} Results ===") 21 | print(f"Extracted Content: {result.content}") 22 | print(f"Raw Markdown Length: {result.raw_markdown_length}") 23 | print(f"Citations Markdown Length: {result.citations_markdown_length}") 24 | else: 25 | print(f"Error in {result.name}: {result.error}") -------------------------------------------------------------------------------- /src/opendeepsearch/context_scraping/fast_scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhanced web scraping implementation using Crawl4AI and vLLM. 3 | Supports multiple extraction strategies with LLM-powered content processing. 
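
Example (illustrative; loading ReaderLM-v2 through vLLM requires a suitable GPU):

    scraper = FastWebScraper()
    result = await scraper.scrape("https://example.com")
    if result.success:
        print(result.content)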
4 | """ 5 | 6 | import asyncio 7 | from dataclasses import dataclass 8 | from typing import Dict, List, Optional, Any 9 | import json 10 | 11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig 12 | from vllm import LLM, SamplingParams 13 | 14 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult 15 | from opendeepsearch.context_scraping.utils import clean_html, get_wikipedia_content 16 | 17 | @dataclass 18 | class LLMConfig: 19 | """Configuration for LLM-based extraction""" 20 | model_name: str = 'jinaai/ReaderLM-v2' 21 | max_model_len: int = 512_000 22 | temperature: float = 0.0 23 | top_k: int = 1 24 | presence_penalty: float = 0.25 25 | frequency_penalty: float = 0.25 26 | repetition_penalty: float = 1.13 27 | max_tokens: int = 16_384 28 | 29 | # DEFAULT_SCHEMA = """ 30 | # { 31 | # "type": "object", 32 | # "properties": { 33 | # "title": { 34 | # "type": "string" 35 | # }, 36 | # "author": { 37 | # "type": "string" 38 | # }, 39 | # "date": { 40 | # "type": "string" 41 | # }, 42 | # "content": { 43 | # "type": "string" 44 | # } 45 | # }, 46 | # "required": ["title", "author", "date", "content"] 47 | # } 48 | # """ 49 | 50 | class FastWebScraper: 51 | """Enhanced scraper with LLM-powered extraction and multiple strategies""" 52 | def __init__( 53 | self, 54 | llm_config: Optional[LLMConfig] = None, 55 | browser_config: Optional[BrowserConfig] = None, 56 | json_schema: Optional[Dict[str, Any]] = None, 57 | debug: bool = False 58 | ): 59 | self.debug = debug 60 | self.browser_config = browser_config or BrowserConfig(headless=True, verbose=debug) 61 | self.llm_config = llm_config or LLMConfig() 62 | self.json_schema = None #json_schema or json.loads(DEFAULT_SCHEMA) 63 | 64 | # Initialize LLM 65 | self.sampling_params = SamplingParams( 66 | temperature=self.llm_config.temperature, 67 | top_k=self.llm_config.top_k, 68 | presence_penalty=self.llm_config.presence_penalty, 69 | repetition_penalty=self.llm_config.repetition_penalty, 70 | max_tokens=self.llm_config.max_tokens, 71 | frequency_penalty=self.llm_config.frequency_penalty 72 | ) 73 | 74 | self.llm = LLM( 75 | model=self.llm_config.model_name, 76 | max_model_len=self.llm_config.max_model_len, 77 | dtype='float16' 78 | ) 79 | 80 | self.tokenizer = self.llm.get_tokenizer() 81 | 82 | def _create_prompt(self, text: str, instruction: Optional[str] = None) -> str: 83 | """Create a prompt for the LLM""" 84 | if not instruction: 85 | instruction = "Extract the main content and convert to structured format." 86 | 87 | if self.json_schema: 88 | instruction = "Extract information according to the schema and return JSON." 
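            # Prompt layout (both branches below): the instruction first, then the cleaned
            # HTML in an ```html fence; when a JSON schema is set, it is appended in a
            # ```json fence so the model can return JSON that matches the schema.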
89 | prompt = f"{instruction}\n```html\n{text}\n```\nSchema:```json\n{json.dumps(self.json_schema, indent=2)}\n```" 90 | else: 91 | prompt = f"{instruction}\n```html\n{text}\n```" 92 | 93 | messages = [{"role": "user", "content": prompt}] 94 | return self.tokenizer.apply_chat_template( 95 | messages, tokenize=False, add_generation_prompt=True 96 | ) 97 | 98 | async def _extract_content(self, html: str, instruction: Optional[str] = None) -> str: 99 | """Extract content using LLM""" 100 | cleaned_html = clean_html(html, clean_svg=True, clean_base64=True) 101 | prompt = self._create_prompt(cleaned_html, instruction) 102 | 103 | outputs = self.llm.generate(prompt, self.sampling_params) 104 | raw_text = outputs[0].outputs[0].text 105 | return self._parse_llm_output(raw_text) 106 | 107 | def _parse_llm_output(self, text: str) -> str: 108 | """ 109 | Parse LLM output, handling both single dictionaries and lists of dictionaries. 110 | Returns the content field from the most appropriate dictionary. 111 | """ 112 | try: 113 | # Strip any markdown code block markers 114 | text = text.strip() 115 | if text.startswith('```') and text.endswith('```'): 116 | text = text.split('```')[1] 117 | if text.startswith('json'): 118 | text = text[4:] 119 | 120 | data = json.loads(text.strip()) 121 | 122 | if isinstance(data, dict): 123 | return data.get('content', '') 124 | 125 | if isinstance(data, list): 126 | # First try to find a dictionary with non-empty content 127 | for item in data: 128 | if isinstance(item, dict) and item.get('content'): 129 | return item['content'] 130 | 131 | # If no content found, return content from last item or empty string 132 | last_item = data[-1] 133 | return last_item.get('content', '') if isinstance(last_item, dict) else '' 134 | 135 | return '' 136 | 137 | except json.JSONDecodeError: 138 | # If JSON parsing fails, return the original text 139 | return text.strip() 140 | except Exception: 141 | return '' 142 | 143 | async def scrape(self, url: str, instruction: Optional[str] = None) -> ExtractionResult: 144 | """ 145 | Scrape and process content from a URL 146 | 147 | Args: 148 | url: Target URL to scrape 149 | instruction: Optional custom instruction for the LLM 150 | """ 151 | try: 152 | if self.debug: 153 | print(f"Debug: Processing URL: {url}") 154 | 155 | # Handle Wikipedia URLs 156 | if 'wikipedia.org/wiki/' in url: 157 | try: 158 | content = get_wikipedia_content(url) 159 | return ExtractionResult( 160 | name="llm_extraction", 161 | success=True, 162 | content=content 163 | ) 164 | except Exception as e: 165 | if self.debug: 166 | print(f"Debug: Wikipedia extraction failed: {str(e)}") 167 | # If Wikipedia extraction fails, fall through to normal scraping 168 | 169 | # Fetch HTML 170 | async with AsyncWebCrawler(config=self.browser_config) as crawler: 171 | result = await crawler.arun(url=url, config=CrawlerRunConfig()) 172 | 173 | if not result.success: 174 | return ExtractionResult( 175 | name="llm_extraction", 176 | success=False, 177 | error="Failed to fetch HTML" 178 | ) 179 | 180 | # Process with LLM 181 | content = await self._extract_content(result.html, instruction) 182 | 183 | return ExtractionResult( 184 | name="llm_extraction", 185 | success=True, 186 | content=content 187 | ) 188 | 189 | except Exception as e: 190 | if self.debug: 191 | import traceback 192 | print(f"Debug: Exception during scraping:") 193 | print(traceback.format_exc()) 194 | 195 | return ExtractionResult( 196 | name="llm_extraction", 197 | success=False, 198 | error=str(e) 199 | ) 200 | 
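    # Illustrative use of the batch helper defined below (URLs are examples only):
    #
    #     scraper = FastWebScraper()
    #     results = await scraper.scrape_many(["https://example.com", "https://python.org"])
    #     for url, res in results.items():
    #         print(url, res.success)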
201 | async def scrape_many(self, urls: List[str], instruction: Optional[str] = None) -> Dict[str, ExtractionResult]: 202 | """ 203 | Scrape multiple URLs 204 | 205 | Args: 206 | urls: List of target URLs 207 | instruction: Optional custom instruction for the LLM 208 | """ 209 | results = {} 210 | for url in urls: 211 | results[url] = await self.scrape(url, instruction) 212 | return results 213 | -------------------------------------------------------------------------------- /src/opendeepsearch/context_scraping/strategy_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains the StrategyFactory class for creating various extraction strategies. 3 | """ 4 | 5 | import os 6 | from typing import Optional 7 | 8 | from crawl4ai.extraction_strategy import ( 9 | LLMExtractionStrategy, 10 | JsonCssExtractionStrategy, 11 | JsonXPathExtractionStrategy, 12 | NoExtractionStrategy, 13 | CosineStrategy, 14 | ) 15 | 16 | class StrategyFactory: 17 | """Factory for creating extraction strategies""" 18 | @staticmethod 19 | def create_llm_strategy( 20 | input_format: str = "markdown", 21 | instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content", 22 | ) -> LLMExtractionStrategy: 23 | return LLMExtractionStrategy( 24 | input_format=input_format, 25 | provider="openrouter/google/gemini-2.0-flash-lite-001", # Uses LiteLLM as provider 26 | api_token=os.getenv("OPENROUTER_API_KEY"), 27 | instruction=instruction 28 | ) 29 | 30 | @staticmethod 31 | def create_css_strategy() -> JsonCssExtractionStrategy: 32 | schema = { 33 | "baseSelector": ".product", 34 | "fields": [ 35 | {"name": "title", "selector": "h1.product-title", "type": "text"}, 36 | {"name": "price", "selector": ".price", "type": "text"}, 37 | {"name": "description", "selector": ".description", "type": "text"}, 38 | ], 39 | } 40 | return JsonCssExtractionStrategy(schema=schema) 41 | 42 | @staticmethod 43 | def create_xpath_strategy() -> JsonXPathExtractionStrategy: 44 | schema = { 45 | "baseSelector": "//div[@class='product']", 46 | "fields": [ 47 | {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"}, 48 | {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"}, 49 | {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}, 50 | ], 51 | } 52 | return JsonXPathExtractionStrategy(schema=schema) 53 | 54 | @staticmethod 55 | def create_no_extraction_strategy() -> NoExtractionStrategy: 56 | return NoExtractionStrategy() 57 | 58 | @staticmethod 59 | def create_cosine_strategy( 60 | semantic_filter: Optional[str] = None, 61 | word_count_threshold: int = 10, 62 | max_dist: float = 0.2, 63 | sim_threshold: float = 0.3, 64 | debug: bool = False 65 | ) -> CosineStrategy: 66 | return CosineStrategy( 67 | semantic_filter=semantic_filter, 68 | word_count_threshold=word_count_threshold, 69 | max_dist=max_dist, 70 | sim_threshold=sim_threshold, 71 | verbose=debug 72 | ) -------------------------------------------------------------------------------- /src/opendeepsearch/context_scraping/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple 3 | import fasttext 4 | from huggingface_hub import hf_hub_download 5 | import wikipediaapi 6 | 7 | # Load the model 8 | model = 
fasttext.load_model(hf_hub_download("kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2", "model.bin")) 9 | 10 | def clean_markdown_links(text: str, min_quality_score: float = 0.2) -> Tuple[str, float]: 11 | """ 12 | Clean markdown links and filter low-quality content. 13 | Returns tuple of (cleaned_text, quality_score) 14 | """ 15 | # Split by double newlines to preserve paragraph structure 16 | paragraphs = text.split('\n\n') 17 | 18 | cleaned_paragraphs = [] 19 | for paragraph in paragraphs: 20 | # Preserve code blocks by checking if paragraph contains ``` tags 21 | if '```' in paragraph: 22 | cleaned_paragraphs.append(paragraph) 23 | continue 24 | 25 | lines = paragraph.split('\n') 26 | filtered_lines = [] 27 | for line in lines: 28 | line = line.strip() 29 | # Keep headers regardless of length 30 | if re.match(r'^#{1,6}\s+', line): 31 | filtered_lines.append(line) 32 | continue 33 | 34 | # Skip common UI/navigation elements 35 | if re.match(r'^(Share|Trade|More|Buy|Sell|Download|Menu|Home|Back|Next|Previous|\d+\s*(BTC|USD|EUR|GBP)|\w{3}-\w{1,3}|Currency:.*|You (Buy|Spend|Receive)|≈|\d+\.\d+)', line, re.IGNORECASE): 36 | continue 37 | 38 | # Count words before removing markdown 39 | word_count = len(re.sub(r'\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>', '', line).split()) 40 | 41 | # Increase minimum word threshold to 12 42 | if word_count < 12: 43 | # Check if line only contains markdown patterns or appears to be a currency/trading related line 44 | cleaned_line = re.sub(r'\[!\[.*?\]\(.*?\)\]\(.*?\)|\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>|\d+(\.\d+)?%?|\$\d+(\.\d+)?', '', line).strip() 45 | if not cleaned_line or len(cleaned_line.split()) < 8: # If nothing substantial remains, skip this line 46 | continue 47 | 48 | filtered_lines.append(line) 49 | 50 | # Only add paragraph if it has any lines left 51 | if filtered_lines: 52 | cleaned_paragraphs.append('\n'.join(filtered_lines)) 53 | 54 | # Rejoin with double newlines 55 | cleaned_text = '\n\n'.join(cleaned_paragraphs) 56 | 57 | # Get quality score 58 | quality_score = predict_educational_value([cleaned_text])[0] 59 | 60 | return cleaned_text, quality_score 61 | 62 | def filter_quality_content(text: str, min_quality_score: float = 0.2) -> str: 63 | """ 64 | Filter content based on quality and returns concatenated quality content 65 | """ 66 | # Split text into paragraphs 67 | paragraphs = text.split('\n\n') 68 | 69 | # Process each paragraph 70 | quality_content = [] 71 | for paragraph in paragraphs: 72 | if not paragraph.strip(): # Skip empty paragraphs 73 | continue 74 | 75 | cleaned_text, quality_score = clean_markdown_links(paragraph, min_quality_score) 76 | if cleaned_text and quality_score >= min_quality_score: 77 | quality_content.append((cleaned_text, quality_score)) 78 | 79 | # Debug print 80 | print(f"Found {len(quality_content)} quality paragraphs out of {len(paragraphs)} total") 81 | 82 | if quality_content: 83 | return "\n\n".join(text for text, _ in quality_content) 84 | return text # Return original text if no quality content found 85 | 86 | def replace_newlines(text: str) -> str: 87 | """Replace multiple newlines with a single space.""" 88 | return re.sub("\n+", " ", text) 89 | 90 | score_dict = { 91 | '__label__': 0, 92 | '__label__Low': 0, 93 | '__label__Mid': 1, 94 | '__label__High': 2 95 | } 96 | 97 | def predict_educational_value(text_list: List[str]) -> List[float]: 98 | """ 99 | Predict educational value scores for a list of texts. 100 | Returns a list of scores between 0 and 2. 
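
    Example (illustrative):

        scores = predict_educational_value(["Photosynthesis converts light into chemical energy."])
        # each score lies in [0, 2]; higher means more textbook-like text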
101 |     """
102 |     text_list = [replace_newlines(text) for text in text_list]
103 |     pred = model.predict(text_list, k=-1)
104 |     score_list = []
105 |     for l, s in zip(*pred):
106 |         score = 0
107 |         for _l, _s in zip(l, s):
108 |             score += score_dict[_l] * _s
109 |         score_list.append(float(score))
110 |     return score_list
111 |
112 | def get_wikipedia_content(url: str) -> str | None:
113 |     """
114 |     Extract content from a Wikipedia URL.
115 |
116 |     Args:
117 |         url: Wikipedia URL to scrape
118 |
119 |     Returns:
120 |         str: Page content if found, None otherwise
121 |     """
122 |     wiki = wikipediaapi.Wikipedia(user_agent="opendeepsearch", language='en')
123 |
124 |     # Extract the page title from URL (everything after /wiki/)
125 |     try:
126 |         title = url.split('/wiki/')[-1]
127 |         page = wiki.page(title)
128 |         if page.exists():
129 |             return page.text
130 |         return None
131 |     except Exception:
132 |         return None
133 |
134 | # Patterns
135 | SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
136 | STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
137 | META_PATTERN = r"<[ ]*meta.*?>"
138 | COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
139 | LINK_PATTERN = r"<[ ]*link.*?>"
140 | BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
141 | SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
142 | IFRAME_PATTERN = r"<[ ]*iframe.*?\/[ ]*iframe[ ]*>"
143 | NOSCRIPT_PATTERN = r"<[ ]*noscript.*?\/[ ]*noscript[ ]*>"
144 | HEADER_PATTERN = r"<[ ]*header.*?\/[ ]*header[ ]*>"
145 | FOOTER_PATTERN = r"<[ ]*footer.*?\/[ ]*footer[ ]*>"
146 | NAV_PATTERN = r"<[ ]*nav.*?\/[ ]*nav[ ]*>"
147 | FORM_PATTERN = r"<[ ]*form.*?\/[ ]*form[ ]*>"
148 |
149 |
150 | def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
151 |     return re.sub(
152 |         SVG_PATTERN,
153 |         lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
154 |         html,
155 |         flags=re.DOTALL,
156 |     )
157 |
158 |
159 | def replace_base64_images(html: str, new_image_src: str = "#") -> str:
160 |     return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)
161 |
162 |
163 | def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
164 |     """Clean HTML content by removing various elements."""
165 |     patterns = [
166 |         SCRIPT_PATTERN,
167 |         STYLE_PATTERN,
168 |         META_PATTERN,
169 |         COMMENT_PATTERN,
170 |         LINK_PATTERN,
171 |         IFRAME_PATTERN,
172 |         NOSCRIPT_PATTERN,
173 |         HEADER_PATTERN,
174 |         FOOTER_PATTERN,
175 |         NAV_PATTERN,
176 |         FORM_PATTERN
177 |     ]
178 |
179 |     for pattern in patterns:
180 |         html = re.sub(pattern, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
181 |
182 |     if clean_svg:
183 |         html = replace_svg(html)
184 |     if clean_base64:
185 |         html = replace_base64_images(html)
186 |
187 |     # Remove empty lines and excessive whitespace
188 |     html = re.sub(r'\n\s*\n', '\n', html)
189 |     html = re.sub(r'\s+', ' ', html)
190 |
191 |     return html.strip()
192 |
193 | JSON_SCHEMA = """
194 | {
195 |     "type": "object",
196 |     "properties": {
197 |         "title": {
198 |             "type": "string"
199 |         },
200 |         "author": {
201 |             "type": "string"
202 |         },
203 |         "date": {
204 |             "type": "string"
205 |         },
206 |         "content": {
207 |             "type": "string"
208 |         }
209 |     },
210 |     "required": ["title", "author", "date", "content"]
211 | }
212 | """
--------------------------------------------------------------------------------
/src/opendeepsearch/ods_agent.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Dict, Any, Literal
 2 | from opendeepsearch.serp_search.serp_search import create_search_api, SearchAPI
 3 | from
opendeepsearch.context_building.process_sources_pro import SourceProcessor 4 | from opendeepsearch.context_building.build_context import build_context 5 | from litellm import completion, utils 6 | from dotenv import load_dotenv 7 | import os 8 | from opendeepsearch.prompts import SEARCH_SYSTEM_PROMPT 9 | import asyncio 10 | import nest_asyncio 11 | load_dotenv() 12 | 13 | class OpenDeepSearchAgent: 14 | def __init__( 15 | self, 16 | model: Optional[str] = None, #We use LiteLLM to call the model 17 | system_prompt: Optional[str] = SEARCH_SYSTEM_PROMPT, 18 | search_provider: Literal["serper", "searxng"] = "serper", 19 | serper_api_key: Optional[str] = None, 20 | searxng_instance_url: Optional[str] = None, 21 | searxng_api_key: Optional[str] = None, 22 | source_processor_config: Optional[Dict[str, Any]] = None, 23 | temperature: float = 0.2, # Slight variation while maintaining reliability 24 | top_p: float = 0.3, # Focus on high-confidence tokens 25 | reranker: Optional[str] = "None", # Optional reranker identifier 26 | ): 27 | """ 28 | Initialize an OpenDeepSearch agent that combines web search, content processing, and LLM capabilities. 29 | 30 | This agent performs web searches using either SerperAPI or SearXNG, processes the search results to extract 31 | relevant information, and uses a language model to generate responses based on the gathered context. 32 | 33 | Args: 34 | model (str): The identifier for the language model to use (compatible with LiteLLM). 35 | system_prompt (str, optional): Custom system prompt for the language model. If not provided, 36 | uses a default prompt that instructs the model to answer based on context. 37 | search_provider (str, optional): The search provider to use ('serper' or 'searxng'). Default is 'serper'. 38 | serper_api_key (str, optional): API key for SerperAPI. Required if search_provider is 'serper' and 39 | SERPER_API_KEY environment variable is not set. 40 | searxng_instance_url (str, optional): URL of the SearXNG instance. Required if search_provider is 'searxng' 41 | and SEARXNG_INSTANCE_URL environment variable is not set. 42 | searxng_api_key (str, optional): API key for SearXNG instance. Optional even if search_provider is 'searxng'. 43 | source_processor_config (Dict[str, Any], optional): Configuration dictionary for the 44 | SourceProcessor. Supports the following options: 45 | - strategies (List[str]): Content extraction strategies to use 46 | - filter_content (bool): Whether to enable content filtering 47 | - top_results (int): Number of top results to process 48 | temperature (float, default=0.2): Controls randomness in model outputs. Lower values make 49 | the output more focused and deterministic. 50 | top_p (float, default=0.3): Controls nucleus sampling for model outputs. Lower values make 51 | the output more focused on high-probability tokens. 52 | reranker (str, optional): Identifier for the reranker to use. If not provided, 53 | uses the default reranker from SourceProcessor. 
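
        Example (illustrative; assumes SERPER_API_KEY and an OpenRouter API key are set):

            agent = OpenDeepSearchAgent(model="openrouter/google/gemini-2.0-flash-001")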
54 | """ 55 | # Initialize search API based on provider 56 | self.serp_search = create_search_api( 57 | search_provider=search_provider, 58 | serper_api_key=serper_api_key, 59 | searxng_instance_url=searxng_instance_url, 60 | searxng_api_key=searxng_api_key 61 | ) 62 | 63 | # Update source_processor_config with reranker if provided 64 | if source_processor_config is None: 65 | source_processor_config = {} 66 | if reranker: 67 | source_processor_config['reranker'] = reranker 68 | 69 | # Initialize SourceProcessor with provided config or defaults 70 | self.source_processor = SourceProcessor(**source_processor_config) 71 | 72 | # Initialize LLM settings 73 | self.model = model if model is not None else os.getenv("LITELLM_SEARCH_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")) 74 | self.temperature = temperature 75 | self.top_p = top_p 76 | self.system_prompt = system_prompt 77 | 78 | # Configure LiteLLM with OpenAI base URL if provided 79 | openai_base_url = os.environ.get("OPENAI_BASE_URL") 80 | if openai_base_url: 81 | utils.set_provider_config("openai", {"base_url": openai_base_url}) 82 | 83 | async def search_and_build_context( 84 | self, 85 | query: str, 86 | max_sources: int = 2, 87 | pro_mode: bool = False 88 | ) -> str: 89 | """ 90 | Performs a web search and builds a context from the search results. 91 | 92 | This method executes a search query, processes the returned sources, and builds a 93 | consolidated context, inspired by FreshPrompt in the FreshLLMs paper, that can be used for answering questions. 94 | 95 | Args: 96 | query (str): The search query to execute. 97 | max_sources (int, default=2): Maximum number of sources to process. If pro_mode 98 | is enabled, this overrides the top_results setting in source_processor_config 99 | when it's smaller. 100 | pro_mode (bool, default=False): When enabled, performs a deeper search and more 101 | thorough content processing. 102 | 103 | Returns: 104 | str: A formatted context string built from the processed search results. 105 | """ 106 | # Get sources from SERP 107 | sources = self.serp_search.get_sources(query) 108 | 109 | # Process sources 110 | processed_sources = await self.source_processor.process_sources( 111 | sources, 112 | max_sources, 113 | query, 114 | pro_mode 115 | ) 116 | 117 | # Build and return context 118 | return build_context(processed_sources) 119 | 120 | async def ask( 121 | self, 122 | query: str, 123 | max_sources: int = 2, 124 | pro_mode: bool = False, 125 | ) -> str: 126 | """ 127 | Searches for information and generates an AI response to the query. 128 | 129 | This method combines web search, context building, and AI completion to provide 130 | informed answers to questions. It first gathers relevant information through search, 131 | then uses an LLM to generate a response based on the collected context. 132 | 133 | Args: 134 | query (str): The question or query to answer. 135 | max_sources (int, default=2): Maximum number of sources to include in the context. 136 | pro_mode (bool, default=False): When enabled, performs a more comprehensive search 137 | and analysis of sources. 138 | 139 | Returns: 140 | str: An AI-generated response that answers the query based on the gathered context. 
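
        Example (illustrative):

            answer = await agent.ask("Who won the 2022 FIFA World Cup?", max_sources=2)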
141 | """ 142 | # Get context from search results 143 | context = await self.search_and_build_context(query, max_sources, pro_mode) 144 | # Prepare messages for the LLM 145 | messages = [ 146 | {"role": "system", "content": self.system_prompt}, 147 | {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"} 148 | ] 149 | # Get completion from LLM 150 | response = completion( 151 | model=self.model, 152 | messages=messages, 153 | temperature=self.temperature, 154 | top_p=self.top_p 155 | ) 156 | 157 | return response.choices[0].message.content 158 | 159 | def ask_sync( 160 | self, 161 | query: str, 162 | max_sources: int = 2, 163 | pro_mode: bool = False, 164 | ) -> str: 165 | """ 166 | Synchronous version of ask() method. 167 | """ 168 | try: 169 | # Try getting the current event loop 170 | loop = asyncio.get_event_loop() 171 | if loop.is_running(): 172 | # If we're in a running event loop (e.g., Jupyter), use nest_asyncio 173 | nest_asyncio.apply() 174 | except RuntimeError: 175 | # If there's no event loop, create a new one 176 | loop = asyncio.new_event_loop() 177 | asyncio.set_event_loop(loop) 178 | 179 | return loop.run_until_complete(self.ask(query, max_sources, pro_mode)) 180 | -------------------------------------------------------------------------------- /src/opendeepsearch/ods_tool.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Literal 2 | from smolagents import Tool 3 | from opendeepsearch.ods_agent import OpenDeepSearchAgent 4 | 5 | class OpenDeepSearchTool(Tool): 6 | name = "web_search" 7 | description = """ 8 | Performs web search based on your query (think a Google search) then returns the final answer that is processed by an llm.""" 9 | inputs = { 10 | "query": { 11 | "type": "string", 12 | "description": "The search query to perform", 13 | }, 14 | } 15 | output_type = "string" 16 | 17 | def __init__( 18 | self, 19 | model_name: Optional[str] = None, 20 | reranker: str = "infinity", 21 | search_provider: Literal["serper", "searxng"] = "serper", 22 | serper_api_key: Optional[str] = None, 23 | searxng_instance_url: Optional[str] = None, 24 | searxng_api_key: Optional[str] = None 25 | ): 26 | super().__init__() 27 | self.search_model_name = model_name # LiteLLM model name 28 | self.reranker = reranker 29 | self.search_provider = search_provider 30 | self.serper_api_key = serper_api_key 31 | self.searxng_instance_url = searxng_instance_url 32 | self.searxng_api_key = searxng_api_key 33 | 34 | def forward(self, query: str): 35 | answer = self.search_tool.ask_sync(query, max_sources=2, pro_mode=True) 36 | return answer 37 | 38 | def setup(self): 39 | self.search_tool = OpenDeepSearchAgent( 40 | self.search_model_name, 41 | reranker=self.reranker, 42 | search_provider=self.search_provider, 43 | serper_api_key=self.serper_api_key, 44 | searxng_instance_url=self.searxng_instance_url, 45 | searxng_api_key=self.searxng_api_key 46 | ) 47 | -------------------------------------------------------------------------------- /src/opendeepsearch/prompts.py: -------------------------------------------------------------------------------- 1 | from smolagents import PromptTemplates 2 | 3 | SEARCH_SYSTEM_PROMPT = """ 4 | You are an AI-powered search agent that takes in a user’s search query, retrieves relevant search results, and provides an accurate and concise answer based on the provided context. 5 | 6 | ## **Guidelines** 7 | 8 | ### 1. 
**Prioritize Reliable Sources** 9 | - Use **ANSWER BOX** when available, as it is the most likely authoritative source. 10 | - Prefer **Wikipedia** if present in the search results for general knowledge queries. 11 | - If there is a conflict between **Wikipedia** and the **ANSWER BOX**, rely on **Wikipedia**. 12 | - Prioritize **government (.gov), educational (.edu), reputable organizations (.org), and major news outlets** over less authoritative sources. 13 | - When multiple sources provide conflicting information, prioritize the most **credible, recent, and consistent** source. 14 | 15 | ### 2. **Extract the Most Relevant Information** 16 | - Focus on **directly answering the query** using the information from the **ANSWER BOX** or **SEARCH RESULTS**. 17 | - Use **additional information** only if it provides **directly relevant** details that clarify or expand on the query. 18 | - Ignore promotional, speculative, or repetitive content. 19 | 20 | ### 3. **Provide a Clear and Concise Answer** 21 | - Keep responses **brief (1–3 sentences)** while ensuring accuracy and completeness. 22 | - If the query involves **numerical data** (e.g., prices, statistics), return the **most recent and precise value** available. 23 | - If the source is available, then mention it in the answer to the question. If you're relying on the answer box, then do not mention the source if it's not there. 24 | - For **diverse or expansive queries** (e.g., explanations, lists, or opinions), provide a more detailed response when the context justifies it. 25 | 26 | ### 4. **Handle Uncertainty and Ambiguity** 27 | - If **conflicting answers** are present, acknowledge the discrepancy and mention the different perspectives if relevant. 28 | - If **no relevant information** is found in the context, explicitly state that the query could not be answered. 29 | 30 | ### 5. **Answer Validation** 31 | - Only return answers that can be **directly validated** from the provided context. 32 | - Do not generate speculative or outside knowledge answers. If the context does not contain the necessary information, state that the answer could not be found. 33 | 34 | ### 6. **Bias and Neutrality** 35 | - Maintain **neutral language** and avoid subjective opinions. 36 | - For controversial topics, present multiple perspectives if they are available and relevant. 37 | """ 38 | 39 | REACT_PROMPT = PromptTemplates(system_prompt=""" 40 | You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can. 41 | To do so, you have been given access to some tools. 42 | 43 | The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation". 44 | This Action/Observation can repeat N times, you should take several steps when needed. 45 | 46 | You can use the result of the previous action as input for the next action. 47 | The observation will always be a string: it can represent a file, like "image_1.jpg". 48 | Then you can use it as input for the next action. You can do it for instance as follows: 49 | 50 | Observation: "image_1.jpg" 51 | 52 | Action: 53 | { 54 | "name": "image_transformer", 55 | "arguments": {"image": "image_1.jpg"} 56 | } 57 | 58 | To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. 
So your final output should look like this: 59 | Action: 60 | { 61 | "name": "final_answer", 62 | "arguments": {"answer": "insert your final answer here"} 63 | } 64 | 65 | 66 | Here are a few examples using notional tools: 67 | --- 68 | Task: "What historical event happened closest in time to the invention of the telephone: the American Civil War or the establishment of the Eiffel Tower?" 69 | 70 | Action: 71 | { 72 | "name": "web_search", 73 | "arguments": {"query": "year of telephone invention"} 74 | } 75 | Observation: "The telephone was invented in 1876." 76 | 77 | Action: 78 | { 79 | "name": "web_search", 80 | "arguments": {"query": "year American Civil War ended"} 81 | } 82 | Observation: "The American Civil War ended in 1865." 83 | 84 | Action: 85 | { 86 | "name": "web_search", 87 | "arguments": {"query": "year Eiffel Tower established"} 88 | } 89 | Observation: "The Eiffel Tower was completed in 1889." 90 | 91 | Action: 92 | { 93 | "name": "calculate", 94 | "arguments": {"expression": "|1876 - 1865| and |1889 - 1876|"} 95 | } 96 | Observation: "11 years (Civil War) and 13 years (Eiffel Tower)." 97 | 98 | Action: 99 | { 100 | "name": "final_answer", 101 | "arguments": {"answer": "The historical event closest in time to the invention of the telephone is the end of the American Civil War (11 years apart)."} 102 | } 103 | 104 | --- 105 | Task: "Which country has a higher population density: Japan or India?" 106 | 107 | Action: 108 | { 109 | "name": "web_search", 110 | "arguments": {"query": "population and area of Japan"} 111 | } 112 | Observation: "Japan has a population of 125 million and an area of 377,975 square kilometers." 113 | 114 | Action: 115 | { 116 | "name": "web_search", 117 | "arguments": {"query": "population and area of India"} 118 | } 119 | Observation: "India has a population of 1.38 billion and an area of 3,287,263 square kilometers." 120 | 121 | Action: 122 | { 123 | "name": "calculate", 124 | "arguments": {"expression": "125 million / 377,975 and 1.38 billion / 3,287,263"} 125 | } 126 | Observation: "Japan: 330.7 people/km²; India: 419.6 people/km²." 127 | 128 | Action: 129 | { 130 | "name": "final_answer", 131 | "arguments": {"answer": "India has a higher population density (419.6 people/km²) than Japan (330.7 people/km²)."} 132 | } 133 | 134 | --- 135 | Task: "Which country has won more total Olympic gold medals: the United States or China?" 136 | 137 | Action: 138 | { 139 | "name": "web_search", 140 | "arguments": {"query": "total Olympic gold medals won by the United States"} 141 | } 142 | Observation: "The United States has won 1,127 gold medals." 143 | 144 | Action: 145 | { 146 | "name": "web_search", 147 | "arguments": {"query": "total Olympic gold medals won by China"} 148 | } 149 | Observation: "China has won 283 gold medals." 150 | 151 | Action: 152 | { 153 | "name": "calculate", 154 | "arguments": {"expression": "1,127 - 283"} 155 | } 156 | Observation: "The United States has 844 more gold medals than China." 157 | 158 | Action: 159 | { 160 | "name": "final_answer", 161 | "arguments": {"answer": "The United States has won more Olympic gold medals (1,127) than China (283)."} 162 | } 163 | 164 | --- 165 | Task: "Who discovered the structure of DNA, and in which year was the discovery made?" 166 | 167 | Action: 168 | { 169 | "name": "web_search", 170 | "arguments": {"query": "scientists who discovered DNA structure"} 171 | } 172 | Observation: "James Watson and Francis Crick discovered the structure of DNA." 
173 | 174 | Action: 175 | { 176 | "name": "web_search", 177 | "arguments": {"query": "year DNA structure discovered"} 178 | } 179 | Observation: "The structure of DNA was discovered in 1953." 180 | 181 | Action: 182 | { 183 | "name": "final_answer", 184 | "arguments": {"answer": "James Watson and Francis Crick discovered the structure of DNA in 1953."} 185 | } 186 | 187 | --- 188 | Task: "How many meters taller is the Burj Khalifa compared to the Empire State Building?" 189 | 190 | Action: 191 | { 192 | "name": "web_search", 193 | "arguments": {"query": "height of Burj Khalifa"} 194 | } 195 | Observation: "The Burj Khalifa is 828 meters tall." 196 | 197 | Action: 198 | { 199 | "name": "web_search", 200 | "arguments": {"query": "height of Empire State Building"} 201 | } 202 | Observation: "The Empire State Building is 381 meters tall." 203 | 204 | Action: 205 | { 206 | "name": "calculate", 207 | "arguments": {"expression": "828 - 381"} 208 | } 209 | Observation: "The difference is 447 meters." 210 | 211 | Action: 212 | { 213 | "name": "final_answer", 214 | "arguments": {"answer": "The Burj Khalifa is 447 meters taller than the Empire State Building."} 215 | } 216 | 217 | --- 218 | Task: "Which country launched the first satellite into space, and what was the name of the satellite?" 219 | 220 | Action: 221 | { 222 | "name": "web_search", 223 | "arguments": {"query": "first satellite launched into space"} 224 | } 225 | Observation: "The Soviet Union launched the first satellite." 226 | 227 | Action: 228 | { 229 | "name": "web_search", 230 | "arguments": {"query": "name of first satellite in space"} 231 | } 232 | Observation: "The first satellite was Sputnik 1." 233 | 234 | Action: 235 | { 236 | "name": "final_answer", 237 | "arguments": {"answer": "The Soviet Union launched the first satellite into space, named Sputnik 1."} 238 | } 239 | 240 | --- 241 | Task: "Which novel by George Orwell introduced the concept of 'Big Brother,' and in what year was it published?" 242 | 243 | Action: 244 | { 245 | "name": "web_search", 246 | "arguments": {"query": "novel by George Orwell Big Brother"} 247 | } 248 | Observation: "The novel is '1984.'" 249 | 250 | Action: 251 | { 252 | "name": "web_search", 253 | "arguments": {"query": "year '1984' by George Orwell published"} 254 | } 255 | Observation: "'1984' was published in 1949." 256 | 257 | Action: 258 | { 259 | "name": "final_answer", 260 | "arguments": {"answer": "George Orwell's novel '1984,' which introduced the concept of 'Big Brother,' was published in 1949."} 261 | } 262 | 263 | --- 264 | Task: "Which country hosted the first FIFA World Cup, and in what year?" 265 | 266 | Action: 267 | { 268 | "name": "web_search", 269 | "arguments": {"query": "country hosted first FIFA World Cup"} 270 | } 271 | Observation: "Uruguay hosted the first FIFA World Cup." 272 | 273 | Action: 274 | { 275 | "name": "web_search", 276 | "arguments": {"query": "year of first FIFA World Cup"} 277 | } 278 | Observation: "The first FIFA World Cup was held in 1930." 279 | 280 | Action: 281 | { 282 | "name": "final_answer", 283 | "arguments": {"answer": "Uruguay hosted the first FIFA World Cup in 1930."} 284 | } 285 | 286 | --- 287 | Task: "Who invented the light bulb, and what company did he later establish?" 288 | 289 | Action: 290 | { 291 | "name": "web_search", 292 | "arguments": {"query": "inventor of the light bulb"} 293 | } 294 | Observation: "Thomas Edison invented the light bulb." 
295 | 296 | Action: 297 | { 298 | "name": "web_search", 299 | "arguments": {"query": "company founded by Thomas Edison"} 300 | } 301 | Observation: "Thomas Edison founded General Electric." 302 | 303 | Action: 304 | { 305 | "name": "final_answer", 306 | "arguments": {"answer": "Thomas Edison invented the light bulb and later established General Electric."} 307 | } 308 | 309 | --- 310 | Task: "In which city was the Declaration of Independence signed, and in what building?" 311 | 312 | Action: 313 | { 314 | "name": "web_search", 315 | "arguments": {"query": "city where Declaration of Independence was signed"} 316 | } 317 | Observation: "The Declaration of Independence was signed in Philadelphia." 318 | 319 | Action: 320 | { 321 | "name": "web_search", 322 | "arguments": {"query": "building where Declaration of Independence was signed"} 323 | } 324 | Observation: "It was signed in Independence Hall." 325 | 326 | Action: 327 | { 328 | "name": "final_answer", 329 | "arguments": {"answer": "The Declaration of Independence was signed in Philadelphia at Independence Hall."} 330 | } 331 | 332 | --- 333 | Task: "Who developed the theory of general relativity, and in what year was it published?" 334 | 335 | Action: 336 | { 337 | "name": "web_search", 338 | "arguments": {"query": "developer of general relativity"} 339 | } 340 | Observation: "Albert Einstein developed the theory of general relativity." 341 | 342 | Action: 343 | { 344 | "name": "web_search", 345 | "arguments": {"query": "year general relativity published"} 346 | } 347 | Observation: "The theory of general relativity was published in 1915." 348 | 349 | Action: 350 | { 351 | "name": "final_answer", 352 | "arguments": {"answer": "Albert Einstein developed the theory of general relativity, which was published in 1915."} 353 | } 354 | 355 | --- 356 | Task: "Which Shakespeare play features the phrase 'To be, or not to be,' and who speaks this line?" 357 | 358 | Action: 359 | { 360 | "name": "web_search", 361 | "arguments": {"query": "Shakespeare play To be, or not to be"} 362 | } 363 | Observation: "The play is 'Hamlet.'" 364 | 365 | Action: 366 | { 367 | "name": "web_search", 368 | "arguments": {"query": "character who says To be, or not to be in Hamlet"} 369 | } 370 | Observation: "The line is spoken by Hamlet." 371 | 372 | Action: 373 | { 374 | "name": "final_answer", 375 | "arguments": {"answer": "The phrase 'To be, or not to be' is from Shakespeare's 'Hamlet,' and it is spoken by the character Hamlet."} 376 | } 377 | 378 | --- 379 | Task: "What is the tallest mountain in Africa, and how high is it?" 380 | 381 | Action: 382 | { 383 | "name": "web_search", 384 | "arguments": {"query": "tallest mountain in Africa"} 385 | } 386 | Observation: "Mount Kilimanjaro is the tallest mountain in Africa." 387 | 388 | Action: 389 | { 390 | "name": "web_search", 391 | "arguments": {"query": "height of Mount Kilimanjaro"} 392 | } 393 | Observation: "Mount Kilimanjaro is 5,895 meters tall." 394 | 395 | Action: 396 | { 397 | "name": "final_answer", 398 | "arguments": {"answer": "Mount Kilimanjaro, the tallest mountain in Africa, is 5,895 meters high."} 399 | } 400 | 401 | --- 402 | Task: "Who was the first President of the United States to serve two non-consecutive terms?" 403 | 404 | Action: 405 | { 406 | "name": "web_search", 407 | "arguments": {"query": "President who served two non-consecutive terms"} 408 | } 409 | Observation: "Grover Cleveland was the first President to serve two non-consecutive terms." 
410 | 411 | Action: 412 | { 413 | "name": "final_answer", 414 | "arguments": {"answer": "Grover Cleveland was the first President of the United States to serve two non-consecutive terms."} 415 | } 416 | 417 | --- 418 | Task: "What planet is the largest in our solar system, and what is its diameter?" 419 | 420 | Action: 421 | { 422 | "name": "web_search", 423 | "arguments": {"query": "largest planet in solar system"} 424 | } 425 | Observation: "Jupiter is the largest planet in the solar system." 426 | 427 | Action: 428 | { 429 | "name": "web_search", 430 | "arguments": {"query": "diameter of Jupiter"} 431 | } 432 | Observation: "Jupiter's diameter is approximately 139,820 kilometers." 433 | 434 | Action: 435 | { 436 | "name": "final_answer", 437 | "arguments": {"answer": "Jupiter is the largest planet in the solar system, with a diameter of approximately 139,820 kilometers."} 438 | } 439 | 440 | --- 441 | Task: "What was the first airplane to fly, and in what year did it achieve this feat?" 442 | 443 | Action: 444 | { 445 | "name": "web_search", 446 | "arguments": {"query": "first airplane to fly"} 447 | } 448 | Observation: "The first airplane to fly was the Wright Flyer." 449 | 450 | Action: 451 | { 452 | "name": "web_search", 453 | "arguments": {"query": "year Wright Flyer first flight"} 454 | } 455 | Observation: "The Wright Flyer flew for the first time in 1903." 456 | 457 | Action: 458 | { 459 | "name": "final_answer", 460 | "arguments": {"answer": "The Wright Flyer was the first airplane to fly, achieving this feat in 1903."} 461 | } 462 | 463 | --- 464 | Task: "Who painted the Mona Lisa, and where is it displayed?" 465 | 466 | Action: 467 | { 468 | "name": "web_search", 469 | "arguments": {"query": "artist who painted Mona Lisa"} 470 | } 471 | Observation: "Leonardo da Vinci painted the Mona Lisa." 472 | 473 | Action: 474 | { 475 | "name": "web_search", 476 | "arguments": {"query": "where is the Mona Lisa displayed"} 477 | } 478 | Observation: "The Mona Lisa is displayed in the Louvre Museum in Paris." 479 | 480 | Action: 481 | { 482 | "name": "final_answer", 483 | "arguments": {"answer": "Leonardo da Vinci painted the Mona Lisa, which is displayed in the Louvre Museum in Paris."} 484 | } 485 | 486 | --- 487 | Task: "Who has won the most Grand Slam tennis titles, and how many have they won?" 488 | 489 | Action: 490 | { 491 | "name": "web_search", 492 | "arguments": {"query": "player with most Grand Slam tennis titles"} 493 | } 494 | Observation: "Novak Djokovic has won the most Grand Slam titles." 495 | 496 | Action: 497 | { 498 | "name": "web_search", 499 | "arguments": {"query": "number of Grand Slam titles Novak Djokovic"} 500 | } 501 | Observation: "Novak Djokovic has won 24 Grand Slam titles." 502 | 503 | Action: 504 | { 505 | "name": "final_answer", 506 | "arguments": {"answer": "Novak Djokovic has won the most Grand Slam tennis titles, with 24 titles."} 507 | } 508 | 509 | --- 510 | Task: "Who was the longest-reigning monarch in British history, and how many years did they reign?" 511 | 512 | Action: 513 | { 514 | "name": "web_search", 515 | "arguments": {"query": "longest reigning monarch in British history"} 516 | } 517 | Observation: "Queen Elizabeth II was the longest-reigning monarch in British history." 518 | 519 | Action: 520 | { 521 | "name": "web_search", 522 | "arguments": {"query": "length of reign Queen Elizabeth II"} 523 | } 524 | Observation: "Queen Elizabeth II reigned for 70 years." 
525 | 
526 | Action:
527 | {
528 |   "name": "final_answer",
529 |   "arguments": {"answer": "Queen Elizabeth II was the longest-reigning monarch in British history, with a reign of 70 years."}
530 | }
531 | 
532 | ---
533 | Task: "Which Shakespeare play contains the line \"All the world's a stage,\" and how many years ago was it first performed if today is 2024?"
534 | 
535 | Action:
536 | {
537 |   "name": "web_search",
538 |   "arguments": {"query": "Shakespeare play All the world's a stage"}
539 | }
540 | Observation: "The line is from \"As You Like It.\""
541 | 
542 | Action:
543 | {
544 |   "name": "web_search",
545 |   "arguments": {"query": "year As You Like It first performed"}
546 | }
547 | Observation: "\"As You Like It\" was first performed in 1603."
548 | 
549 | Action:
550 | {
551 |   "name": "calculate",
552 |   "arguments": {"expression": "2024 - 1603"}
553 | }
554 | Observation: "421 years."
555 | 
556 | Action:
557 | {
558 |   "name": "final_answer",
559 |   "arguments": {"answer": "\"As You Like It\" contains the line \"All the world's a stage\" and was first performed 421 years ago, in 1603."}
560 | }
561 | 
562 | The above examples used notional tools that might not exist for you. You only have access to these tools:
563 | {%- for tool in tools.values() %}
564 | - {{ tool.name }}: {{ tool.description }}
565 |     Takes inputs: {{tool.inputs}}
566 |     Returns an output of type: {{tool.output_type}}
567 | {%- endfor %}
568 | 
569 | {%- if managed_agents and managed_agents.values() | list %}
570 | You can also give tasks to team members.
571 | Calling a team member works the same way as calling a tool: the only argument you can give in the call is 'task', a long string explaining your task.
572 | Given that this team member is a real human, you should be very verbose in your task.
573 | Here is a list of the team members that you can call:
574 | {%- for agent in managed_agents.values() %}
575 | - {{ agent.name }}: {{ agent.description }}
576 | {%- endfor %}
577 | {%- else %}
578 | {%- endif %}
579 | 
580 | Here are the rules you should always follow to solve your task:
581 | 1. ALWAYS provide a tool call; otherwise you will fail.
582 | 2. Always use the right arguments for the tools. Never use variable names as the action arguments; use the values instead.
583 | 3. Call a tool only when needed: do not call the search agent if you do not need information; try to solve the task yourself first.
584 |    If no tool call is needed, use the final_answer tool to return your answer.
585 | 4. Never re-do a tool call that you previously did with the exact same parameters.
586 | 
587 | Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
588 | """)
--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/README.md:
--------------------------------------------------------------------------------
1 | ## Semantic Search and Reranking
2 | 
3 | The OpenDeepSearch library provides a flexible framework for semantic search and document reranking. At its core is the `BaseSemanticSearcher` class, which can be extended to implement various reranking strategies.
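
For orientation, here is a minimal sketch of how any concrete searcher is used once implemented (the `JinaReranker` described below is one such subclass; the documents, query, and score value are illustrative):

```python
from opendeepsearch.ranking_models.jina_reranker import JinaReranker

reranker = JinaReranker()  # any BaseSemanticSearcher subclass exposes the same interface

documents = [
    "Munich is in Germany.",
    "The sky is blue.",
]

# rerank() returns the top-k documents with similarity scores, e.g.
# [{"document": "The sky is blue.", "score": 0.87}]
results = reranker.rerank("What color is the sky?", documents, top_k=1)
```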
4 | 
5 | ### Creating Your Own Reranker
6 | 
7 | To implement your own reranker, simply inherit from `BaseSemanticSearcher` and implement the `_get_embeddings()` method:
8 | 
9 | ```python
10 | from typing import List
11 | import torch
12 | from opendeepsearch.ranking_models.base_reranker import BaseSemanticSearcher
13 | 
14 | class MyCustomReranker(BaseSemanticSearcher):
15 |     def __init__(self):
16 |         super().__init__()
17 |         self.model = YourEmbeddingModel()  # initialize your embedding model here
18 |     def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
19 |         # Implement your embedding logic here
20 |         embeddings = self.model.encode(texts)
21 |         return torch.tensor(embeddings)
22 | ```
23 | 
24 | The base class automatically handles:
25 | - Similarity score calculation
26 | - Score normalization (softmax, scaling, or none)
27 | - Document reranking
28 | - Top-k selection
29 | 
30 | ### Using Infinity Rerankers
31 | 
32 | For high-performance reranking, we support [Infinity](https://github.com/michaelfeil/infinity) rerankers, which offer state-of-the-art performance. To use an Infinity reranker, first start the Infinity server:
33 | 
34 | ```bash
35 | # requires ~16-32GB VRAM and an NVIDIA GPU with Compute Capability >= 8.0
36 | docker run \
37 | -v $PWD/data:/app/.cache --gpus "0" -p "7997":"7997" \
38 | michaelf34/infinity:0.0.68-trt-onnx \
39 | v2 --model-id Alibaba-NLP/gte-Qwen2-7B-instruct --revision "refs/pr/38" \
40 | --dtype bfloat16 --batch-size 8 --device cuda --engine torch --port 7997 \
41 | --no-bettertransformer
42 | ```
43 | 
44 | This will start an Infinity server using the Qwen2-7B-instruct model. The server will be available at `localhost:7997`.
45 | 
46 | Key parameters:
47 | - `--model-id`: The Hugging Face model ID to use (see [supported models](https://github.com/michaelfeil/infinity#supported-tasks-and-models-by-infinity))
48 | - `--dtype`: Data type for inference (bfloat16 recommended for modern GPUs)
49 | - `--batch-size`: Batch size for inference
50 | - `--port`: Port to expose the server on
51 | 
52 | For specialized deployments, Infinity provides several Docker images:
53 | - `latest-cpu` - For CPU-only inference
54 | - `latest-rocm` - For AMD ROCm GPUs
55 | - `latest-trt-onnx` - For NVIDIA GPUs with TensorRT/ONNX optimizations
56 | 
57 | See the [Infinity documentation](https://michaelfeil.github.io/infinity/) for more details on deployment options and configuration.
58 | 
59 | Note: Ensure you have sufficient VRAM (16-32GB) and a compatible NVIDIA GPU (Compute Capability ≥ 8.0) before running the Infinity server.
60 | 
61 | ### Using Jina AI (or API-based) Rerankers
62 | 
63 | Jina AI provides powerful embedding models through their API service. The `JinaReranker` class offers a simple way to leverage these models:
64 | 
65 | ```python
66 | from opendeepsearch.ranking_models.jina_reranker import JinaReranker
67 | 
68 | # Initialize with your API key
69 | reranker = JinaReranker(api_key="your_api_key")  # or set JINA_API_KEY env variable
70 | 
71 | # Example usage
72 | query = "What is machine learning?"
73 | documents = [
74 |     "Machine learning is a subset of artificial intelligence",
75 |     "Deep learning is a type of machine learning",
76 |     "Natural language processing uses machine learning"
77 | ]
78 | 
79 | # Get the top 2 most relevant documents
80 | results = reranker.rerank(query, documents, top_k=2)
81 | ```
82 | 
83 | The JinaReranker uses Jina's v3 embeddings by default, which provide:
84 | - 1024-dimensional embeddings
85 | - Optimization for text-matching tasks
86 | - State-of-the-art performance for semantic search
87 | 
88 | To use JinaReranker:
89 | 1. Sign up for a Jina AI API key at https://jina.ai
90 | 2. Either pass the API key directly or set it as the environment variable `JINA_API_KEY`
91 | 3. Optionally specify a different model using the `model` parameter
92 | 
93 | Note: Unlike Infinity rerankers, which run locally, Jina rerankers require an internet connection and API credits.
--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/base_reranker.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | import torch
3 | from typing import List, Dict, Union
4 | 
5 | class BaseSemanticSearcher(ABC):
6 |     """
7 |     Abstract base class for semantic search implementations.
8 | 
9 |     This class defines the interface that all semantic searchers must implement.
10 |     Subclasses should implement the _get_embeddings method according to their
11 |     specific embedding source.
12 |     """
13 | 
14 |     @abstractmethod
15 |     def _get_embeddings(self, texts: List[str]) -> torch.Tensor:
16 |         """
17 |         Get embeddings for a list of texts.
18 | 
19 |         Args:
20 |             texts: List of text strings to embed
21 | 
22 |         Returns:
23 |             torch.Tensor containing the embeddings, with shape (num_texts, embedding_dim)
24 |         """
25 |         pass
26 | 
27 |     def calculate_scores(
28 |         self,
29 |         queries: List[str],
30 |         documents: List[str],
31 |         normalize: str = "softmax"  # Options: "softmax", "scale", "none"
32 |     ) -> torch.Tensor:
33 |         """
34 |         Calculate similarity scores between queries and documents.
35 | 
36 |         Args:
37 |             queries: List of query strings
38 |             documents: List of document strings
39 |             normalize: Normalization method:
40 |                 - "softmax": Apply softmax normalization (default)
41 |                 - "scale": Scale to 0-100 range
42 |                 - "none": No normalization
43 | 
44 |         Returns:
45 |             torch.Tensor of shape (num_queries, num_documents) containing similarity scores
46 |         """
47 |         # Get embeddings for queries and documents
48 |         query_embeddings = self._get_embeddings(queries)
49 |         doc_embeddings = self._get_embeddings(documents)
50 | 
51 |         # Calculate similarity scores
52 |         scores = query_embeddings @ doc_embeddings.T
53 | 
54 |         # Apply normalization
55 |         if normalize == "softmax":
56 |             scores = torch.softmax(scores, dim=-1)
57 |         elif normalize == "scale":
58 |             scores = scores * 100
59 |         elif normalize == "none":
60 |             pass
61 |         else:
62 |             raise ValueError(f"Unknown normalization method: {normalize}")
63 | 
64 |         return scores
65 | 
66 |     def rerank(
67 |         self,
68 |         query: Union[str, List[str]],
69 |         documents: List[str],
70 |         top_k: int = 5,
71 |         normalize: str = "softmax"
72 |     ) -> List[Dict[str, Union[str, float]]]:
73 |         """
74 |         Rerank documents based on their semantic similarity to the query.
75 | 
76 |         Args:
77 |             query: Query string or list of query strings
78 |             documents: List of documents to rerank
79 |             top_k: Number of top results to return per query
80 |             normalize: Normalization method for scores
81 | 
82 |         Returns:
83 |             List of dicts containing reranked documents and their scores.
84 |             For single query: [{"document": str, "score": float}, ...]
85 |             For multiple queries: [[{"document": str, "score": float}, ...], ...]
86 |         """
87 |         queries = [query] if isinstance(query, str) else query
88 |         scores = self.calculate_scores(queries, documents, normalize=normalize)
89 | 
90 |         results = []
91 |         for query_scores in scores:
92 |             top_indices = torch.topk(query_scores, min(top_k, len(documents)), dim=0)
93 |             query_results = [
94 |                 {
95 |                     "document": documents[idx.item()],
96 |                     "score": score.item()
97 |                 }
98 |                 for score, idx in zip(top_indices.values, top_indices.indices)
99 |             ]
100 |             results.append(query_results)
101 | 
102 |         return results[0] if isinstance(query, str) else results
103 | 
104 |     def get_reranked_documents(
105 |         self,
106 |         query: Union[str, List[str]],
107 |         documents: List[str],
108 |         top_k: int = 5,
109 |         normalize: str = "softmax"
110 |     ) -> Union[str, List[str]]:
111 |         """
112 |         Returns only the reranked documents, without scores, joined into a
113 |         single newline-separated string per query.
114 | 
115 |         Args:
116 |             query: Query string or list of query strings
117 |             documents: List of documents to rerank
118 |             top_k: Number of top results to return per query
119 |             normalize: Normalization method for scores
120 | 
121 |         Returns:
122 |             For a single query: one string containing the top-k documents joined by newlines
123 |             For multiple queries: a list of such strings, one per query
124 |         """
125 |         results = self.rerank(query, documents, top_k, normalize)
126 |         if isinstance(query, str):
127 |             return "\n".join(x['document'].strip() for x in results)
128 |         return ["\n".join(x['document'].strip() for x in query_results) for query_results in results]
129 | 
--------------------------------------------------------------------------------
/src/opendeepsearch/ranking_models/chunker.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from langchain_text_splitters import RecursiveCharacterTextSplitter
3 | 
4 | class Chunker:
5 |     """A modular text chunking class that splits text into smaller, overlapping segments.
6 | 
7 |     This class provides a flexible way to break down large texts into smaller chunks
8 |     while maintaining context through configurable overlap. It uses RecursiveCharacterTextSplitter
9 |     from langchain under the hood.
10 | 
11 |     Attributes:
12 |         chunk_size (int): The target size for each text chunk.
13 |         chunk_overlap (int): The number of characters to overlap between chunks.
14 |         separators (List[str]): List of separators to use for splitting, in order of preference.
15 |         length_function (callable): Function to measure text length (default: len).
16 |     """
17 | 
18 |     def __init__(
19 |         self,
20 |         chunk_size: int = 150,
21 |         chunk_overlap: int = 50,
22 |         separators: Optional[List[str]] = None,
23 |         length_function: callable = len
24 |     ):
25 |         """Initialize the Chunker with specified parameters.
26 | 
27 |         Args:
28 |             chunk_size (int, optional): Target size for each chunk. Defaults to 150.
29 |             chunk_overlap (int, optional): Number of characters to overlap. Defaults to 50.
30 |             separators (List[str], optional): Custom separators for splitting.
31 |                 Defaults to ["\n\n", "\n"].
32 |             length_function (callable, optional): Function to measure text length.
33 |                 Defaults to len.
34 | """ 35 | self.chunk_size = chunk_size 36 | self.chunk_overlap = chunk_overlap 37 | self.separators = separators or ["\n\n", "\n"] 38 | self.length_function = length_function 39 | 40 | self.splitter = RecursiveCharacterTextSplitter( 41 | separators=self.separators, 42 | chunk_size=self.chunk_size, 43 | chunk_overlap=self.chunk_overlap, 44 | length_function=self.length_function 45 | ) 46 | 47 | def split_text(self, text: str) -> List[str]: 48 | """Split a single text into chunks. 49 | 50 | Args: 51 | text (str): The input text to be split into chunks. 52 | 53 | Returns: 54 | List[str]: A list of text chunks. 55 | """ 56 | return self.splitter.split_text(text) 57 | 58 | def split_texts(self, texts: List[str]) -> List[List[str]]: 59 | """Split multiple texts into chunks. 60 | 61 | Args: 62 | texts (List[str]): A list of input texts to be split into chunks. 63 | 64 | Returns: 65 | List[List[str]]: A list of lists, where each inner list contains 66 | the chunks for one input text. 67 | """ 68 | return [self.split_text(text) for text in texts] 69 | -------------------------------------------------------------------------------- /src/opendeepsearch/ranking_models/infinity_rerank.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import requests 3 | import json 4 | from typing import List 5 | from opendeepsearch.ranking_models.base_reranker import BaseSemanticSearcher 6 | 7 | class InfinitySemanticSearcher(BaseSemanticSearcher): 8 | """ 9 | A semantic reranking model that uses the Infinity Embedding API for text embeddings. 10 | 11 | This class provides methods to rerank documents based on their semantic similarity 12 | to queries using embeddings from the Infinity API. The API endpoint expects to receive 13 | text inputs and returns high-dimensional embeddings that capture semantic meaning. 14 | 15 | The default model used is 'Alibaba-NLP/gte-Qwen2-7B-instruct', but other models 16 | available through the Infinity API can be specified. 17 | 18 | Attributes: 19 | embedding_endpoint (str): URL of the Infinity Embedding API endpoint 20 | model_name (str): Name of the embedding model to use 21 | 22 | Example: 23 | ```python 24 | reranker = SemanticSearch( 25 | embedding_endpoint="http://localhost:7997/embeddings", 26 | model_name="Alibaba-NLP/gte-Qwen2-7B-instruct" 27 | ) 28 | 29 | documents = [ 30 | "Munich is in Germany.", 31 | "The sky is blue." 32 | ] 33 | 34 | results = reranker.rerank( 35 | query="What color is the sky?", 36 | documents=documents, 37 | top_k=1 38 | ) 39 | ``` 40 | """ 41 | 42 | def __init__( 43 | self, 44 | embedding_endpoint: str = "http://localhost:7997/embeddings", 45 | model_name: str = "Alibaba-NLP/gte-Qwen2-7B-instruct", 46 | instruction_prefix: str = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: " 47 | ): 48 | """ 49 | Initialize the semantic search engine with Infinity Embedding API settings. 50 | 51 | Args: 52 | embedding_endpoint: URL of the Infinity Embedding API endpoint 53 | model_name: Name of the embedding model available in Infinity API 54 | instruction_prefix: Prefix to add to queries for better search relevance 55 | """ 56 | self.embedding_endpoint = embedding_endpoint 57 | self.model_name = model_name 58 | self.instruction_prefix = instruction_prefix 59 | 60 | def _get_embeddings(self, texts: List[str], embedding_type: str = "query") -> torch.Tensor: 61 | """ 62 | Get embeddings for a list of texts using the Infinity API. 
63 | """ 64 | MAX_TEXTS = 2048 65 | if len(texts) > MAX_TEXTS: 66 | import warnings 67 | warnings.warn(f"Number of texts ({len(texts)}) exceeds maximum of {MAX_TEXTS}. List will be truncated.") 68 | texts = texts[:MAX_TEXTS] 69 | 70 | # Format queries with instruction prefix 71 | formatted_texts = [ 72 | self.instruction_prefix + text if embedding_type == "query" else text 73 | for text in texts 74 | ] 75 | 76 | response = requests.post( 77 | self.embedding_endpoint, 78 | json={ 79 | "model": self.model_name, 80 | "input": formatted_texts 81 | } 82 | ) 83 | 84 | content_str = response.content.decode('utf-8') 85 | content_json = json.loads(content_str) 86 | return torch.tensor([item['embedding'] for item in content_json['data']]) 87 | -------------------------------------------------------------------------------- /src/opendeepsearch/ranking_models/jina_reranker.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import torch 3 | from typing import List, Optional 4 | from dotenv import load_dotenv 5 | import os 6 | from .base_reranker import BaseSemanticSearcher 7 | 8 | class JinaReranker(BaseSemanticSearcher): 9 | """ 10 | Semantic searcher implementation using Jina AI's embedding API. 11 | """ 12 | 13 | def __init__(self, api_key: Optional[str] = None, model: str = "jina-embeddings-v3"): 14 | """ 15 | Initialize the Jina reranker. 16 | 17 | Args: 18 | api_key: Jina AI API key. If None, will load from environment variable JINA_API_KEY 19 | model: Model name to use (default: "jina-embeddings-v3") 20 | """ 21 | if api_key is None: 22 | load_dotenv() 23 | api_key = os.getenv('JINA_API_KEY') 24 | if not api_key: 25 | raise ValueError("No API key provided and JINA_API_KEY not found in environment variables") 26 | 27 | self.api_url = 'https://api.jina.ai/v1/embeddings' 28 | self.headers = { 29 | 'Content-Type': 'application/json', 30 | 'Authorization': f'Bearer {api_key}' 31 | } 32 | self.model = model 33 | 34 | def _get_embeddings(self, texts: List[str]) -> torch.Tensor: 35 | """ 36 | Get embeddings for a list of texts using Jina AI API. 
37 | 38 | Args: 39 | texts: List of text strings to embed 40 | 41 | Returns: 42 | torch.Tensor containing the embeddings 43 | """ 44 | data = { 45 | "model": self.model, 46 | "task": "text-matching", 47 | "late_chunking": False, 48 | "dimensions": 1024, 49 | "embedding_type": "float", 50 | "input": texts 51 | } 52 | 53 | try: 54 | response = requests.post(self.api_url, headers=self.headers, json=data) 55 | response.raise_for_status() # Raise exception for non-200 status codes 56 | 57 | # Extract embeddings from response 58 | embeddings_data = [item["embedding"] for item in response.json()["data"]] 59 | 60 | # Convert to torch tensor 61 | embeddings = torch.tensor(embeddings_data) 62 | 63 | return embeddings 64 | 65 | except requests.exceptions.RequestException as e: 66 | raise RuntimeError(f"Error calling Jina AI API: {str(e)}") 67 | -------------------------------------------------------------------------------- /src/opendeepsearch/serp_search/serp_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from typing import Dict, Any, Optional, List, TypeVar, Generic, Union 4 | from abc import ABC, abstractmethod 5 | 6 | import requests 7 | 8 | T = TypeVar('T') 9 | 10 | class SearchAPIException(Exception): 11 | """Custom exception for Search API related errors""" 12 | pass 13 | 14 | class SerperAPIException(SearchAPIException): 15 | """Custom exception for Serper API related errors""" 16 | pass 17 | 18 | class SearXNGException(SearchAPIException): 19 | """Custom exception for SearXNG related errors""" 20 | pass 21 | 22 | @dataclass 23 | class SerperConfig: 24 | """Configuration for Serper API""" 25 | api_key: str 26 | api_url: str = "https://google.serper.dev/search" 27 | default_location: str = 'us' 28 | timeout: int = 10 29 | 30 | @classmethod 31 | def from_env(cls) -> 'SerperConfig': 32 | """Create config from environment variables""" 33 | api_key = os.getenv("SERPER_API_KEY") 34 | if not api_key: 35 | raise SerperAPIException("SERPER_API_KEY environment variable not set") 36 | return cls(api_key=api_key) 37 | 38 | @dataclass 39 | class SearXNGConfig: 40 | """Configuration for SearXNG instance""" 41 | instance_url: str 42 | api_key: Optional[str] = None 43 | default_location: str = 'all' 44 | timeout: int = 10 45 | 46 | @classmethod 47 | def from_env(cls) -> 'SearXNGConfig': 48 | """Create config from environment variables""" 49 | instance_url = os.getenv("SEARXNG_INSTANCE_URL") 50 | if not instance_url: 51 | raise SearXNGException("SEARXNG_INSTANCE_URL environment variable not set") 52 | api_key = os.getenv("SEARXNG_API_KEY") # Optional 53 | return cls(instance_url=instance_url, api_key=api_key) 54 | 55 | class SearchResult(Generic[T]): 56 | """Container for search results with error handling""" 57 | def __init__(self, data: Optional[T] = None, error: Optional[str] = None): 58 | self.data = data 59 | self.error = error 60 | self.success = error is None 61 | 62 | @property 63 | def failed(self) -> bool: 64 | return not self.success 65 | 66 | class SearchAPI(ABC): 67 | """Abstract base class for search APIs""" 68 | @abstractmethod 69 | def get_sources( 70 | self, 71 | query: str, 72 | num_results: int = 8, 73 | stored_location: Optional[str] = None 74 | ) -> SearchResult[Dict[str, Any]]: 75 | """Get search results from the API""" 76 | pass 77 | 78 | class SerperAPI(SearchAPI): 79 | def __init__(self, api_key: Optional[str] = None, config: Optional[SerperConfig] = None): 80 | if api_key: 81 | 
self.config = SerperConfig(api_key=api_key) 82 | else: 83 | self.config = config or SerperConfig.from_env() 84 | 85 | self.headers = { 86 | 'X-API-KEY': self.config.api_key, 87 | 'Content-Type': 'application/json' 88 | } 89 | 90 | @staticmethod 91 | def extract_fields(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]: 92 | """Extract specified fields from a list of dictionaries""" 93 | return [{key: item.get(key, "") for key in fields if key in item} for item in items] 94 | 95 | def get_sources( 96 | self, 97 | query: str, 98 | num_results: int = 8, 99 | stored_location: Optional[str] = None 100 | ) -> SearchResult[Dict[str, Any]]: 101 | """ 102 | Fetch search results from Serper API. 103 | 104 | Args: 105 | query: Search query string 106 | num_results: Number of results to return (default: 8, max: 10) 107 | stored_location: Optional location string 108 | 109 | Returns: 110 | SearchResult containing the search results or error information 111 | """ 112 | if not query.strip(): 113 | return SearchResult(error="Query cannot be empty") 114 | 115 | try: 116 | search_location = (stored_location or self.config.default_location).lower() 117 | 118 | payload = { 119 | "q": query, 120 | "num": min(max(1, num_results), 10), 121 | "gl": search_location 122 | } 123 | 124 | response = requests.post( 125 | self.config.api_url, 126 | headers=self.headers, 127 | json=payload, 128 | timeout=self.config.timeout 129 | ) 130 | response.raise_for_status() 131 | data = response.json() 132 | 133 | results = { 134 | 'organic': self.extract_fields( 135 | data.get('organic', []), 136 | ['title', 'link', 'snippet', 'date'] 137 | ), 138 | 'topStories': self.extract_fields( 139 | data.get('topStories', []), 140 | ['title', 'imageUrl'] 141 | ), 142 | 'images': self.extract_fields( 143 | data.get('images', [])[:6], 144 | ['title', 'imageUrl'] 145 | ), 146 | 'graph': data.get('knowledgeGraph'), 147 | 'answerBox': data.get('answerBox'), 148 | 'peopleAlsoAsk': data.get('peopleAlsoAsk'), 149 | 'relatedSearches': data.get('relatedSearches') 150 | } 151 | 152 | return SearchResult(data=results) 153 | 154 | except requests.RequestException as e: 155 | return SearchResult(error=f"API request failed: {str(e)}") 156 | except Exception as e: 157 | return SearchResult(error=f"Unexpected error: {str(e)}") 158 | 159 | 160 | class SearXNGAPI(SearchAPI): 161 | """API client for SearXNG search engine""" 162 | 163 | def __init__(self, instance_url: Optional[str] = None, api_key: Optional[str] = None, config: Optional[SearXNGConfig] = None): 164 | if instance_url: 165 | self.config = SearXNGConfig(instance_url=instance_url, api_key=api_key) 166 | else: 167 | self.config = config or SearXNGConfig.from_env() 168 | 169 | self.headers = {'Content-Type': 'application/json'} 170 | if self.config.api_key: 171 | self.headers['X-API-Key'] = self.config.api_key 172 | 173 | def get_sources( 174 | self, 175 | query: str, 176 | num_results: int = 8, 177 | stored_location: Optional[str] = None 178 | ) -> SearchResult[Dict[str, Any]]: 179 | """ 180 | Fetch search results from SearXNG instance. 
181 | 182 | Args: 183 | query: Search query string 184 | num_results: Number of results to return (default: 8) 185 | stored_location: Optional location string (may not be supported by all instances) 186 | 187 | Returns: 188 | SearchResult containing the search results or error information 189 | """ 190 | if not query.strip(): 191 | return SearchResult(error="Query cannot be empty") 192 | 193 | try: 194 | # Ensure the instance URL ends with /search 195 | search_url = self.config.instance_url 196 | if not search_url.endswith('/search'): 197 | search_url = search_url.rstrip('/') + '/search' 198 | 199 | # Prepare parameters for SearXNG 200 | params = { 201 | 'q': query, 202 | 'format': 'json', 203 | 'pageno': 1, 204 | 'categories': 'general', 205 | 'language': 'all', 206 | 'time_range': None, 207 | 'safesearch': 0, 208 | 'engines': 'google,bing,duckduckgo', # Default engines, can be customised 209 | 'max_results': min(max(1, num_results), 20) # Limit to reasonable range 210 | } 211 | 212 | # Add location if provided and supported 213 | if stored_location and stored_location != 'all': 214 | params['language'] = stored_location 215 | 216 | response = requests.get( 217 | search_url, 218 | headers=self.headers, 219 | params=params, 220 | timeout=self.config.timeout 221 | ) 222 | response.raise_for_status() 223 | data = response.json() 224 | 225 | # Transform SearXNG results to match SerperAPI format 226 | organic_results = [] 227 | for result in data.get('results', [])[:num_results]: 228 | organic_results.append({ 229 | 'title': result.get('title', ''), 230 | 'link': result.get('url', ''), 231 | 'snippet': result.get('content', ''), 232 | 'date': result.get('publishedDate', '') 233 | }) 234 | 235 | # Extract image results if available 236 | image_results = [] 237 | for result in data.get('results', []): 238 | if result.get('img_src'): 239 | image_results.append({ 240 | 'title': result.get('title', ''), 241 | 'imageUrl': result.get('img_src', '') 242 | }) 243 | image_results = image_results[:6] # Limit to 6 images like SerperAPI 244 | 245 | # Format results to match SerperAPI structure 246 | results = { 247 | 'organic': organic_results, 248 | 'images': image_results, 249 | 'topStories': [], # SearXNG might not have direct equivalent 250 | 'graph': None, # SearXNG doesn't provide knowledge graph 251 | 'answerBox': None, # SearXNG doesn't provide answer box 252 | 'peopleAlsoAsk': None, 253 | 'relatedSearches': data.get('suggestions', []) 254 | } 255 | 256 | return SearchResult(data=results) 257 | 258 | except requests.RequestException as e: 259 | return SearchResult(error=f"SearXNG API request failed: {str(e)}") 260 | except Exception as e: 261 | return SearchResult(error=f"Unexpected error with SearXNG: {str(e)}") 262 | 263 | 264 | def create_search_api( 265 | search_provider: str = "serper", 266 | serper_api_key: Optional[str] = None, 267 | searxng_instance_url: Optional[str] = None, 268 | searxng_api_key: Optional[str] = None 269 | ) -> SearchAPI: 270 | """ 271 | Factory function to create the appropriate search API client. 
272 | 
273 |     Args:
274 |         search_provider: The search provider to use ('serper' or 'searxng')
275 |         serper_api_key: Optional API key for Serper
276 |         searxng_instance_url: Optional SearXNG instance URL
277 |         searxng_api_key: Optional API key for SearXNG instance
278 | 
279 |     Returns:
280 |         An instance of a SearchAPI implementation
281 | 
282 |     Raises:
283 |         ValueError: If an invalid search provider is specified
284 |     """
285 |     if search_provider.lower() == "serper":
286 |         return SerperAPI(api_key=serper_api_key)
287 |     elif search_provider.lower() == "searxng":
288 |         return SearXNGAPI(instance_url=searxng_instance_url, api_key=searxng_api_key)
289 |     else:
290 |         raise ValueError(f"Invalid search provider: {search_provider}. Must be 'serper' or 'searxng'")
291 | 
--------------------------------------------------------------------------------
/src/opendeepsearch/wolfram_tool.py:
--------------------------------------------------------------------------------
1 | from smolagents import Tool
2 | import wolframalpha
3 | import json
4 | import os
5 | 
6 | class WolframAlphaTool(Tool):
7 |     name = "calculate"
8 |     description = """
9 |     Performs computational, mathematical, and factual queries using Wolfram Alpha's computational knowledge engine.
10 |     """
11 |     inputs = {
12 |         "query": {
13 |             "type": "string",
14 |             "description": "The query to send to Wolfram Alpha",
15 |         },
16 |     }
17 |     output_type = "string"
18 | 
19 |     def __init__(self, app_id: str):
20 |         super().__init__()
21 |         self.app_id = app_id
22 | 
23 |     def setup(self):
24 |         # Create the Wolfram Alpha client once so it can be reused across calls
25 |         self.wolfram_client = wolframalpha.Client(self.app_id)
26 | 
27 |     def forward(self, query: str):
28 | 
29 |         # Lazily create the client in case setup() has not been called yet
30 |         if not hasattr(self, "wolfram_client"):
31 |             self.setup()
32 | 
33 |         try:
34 |             # Send the query to Wolfram Alpha
35 |             res = self.wolfram_client.query(query)
36 | 
37 |             # Process the results
38 |             results = []
39 |             for pod in res.pods:
40 |                 if pod.title:
41 |                     for subpod in pod.subpods:
42 |                         if subpod.plaintext:
43 |                             results.append({
44 |                                 'title': pod.title,
45 |                                 'result': subpod.plaintext
46 |                             })
47 | 
48 |             # Convert results to a JSON-serializable format
49 |             formatted_result = {
50 |                 'queryresult': {
51 |                     'success': bool(results),
52 |                     'inputstring': query,
53 |                     'pods': [
54 |                         {
55 |                             'title': result['title'],
56 |                             'subpods': [{'title': '', 'plaintext': result['result']}]
57 |                         } for result in results
58 |                     ]
59 |                 }
60 |             }
61 | 
62 |             # Initialize final_result with a default value
63 |             final_result = "No result found."
64 | 
65 |             # Extract the pods from the query result
66 |             pods = formatted_result.get("queryresult", {}).get("pods", [])
67 | 
68 |             # Loop through pods to find the "Result" title
69 |             for pod in pods:
70 |                 if pod.get("title") == "Result":
71 |                     # Extract and return the plaintext from the subpods
72 |                     subpods = pod.get("subpods", [])
73 |                     if subpods:
74 |                         final_result = subpods[0].get("plaintext", "").strip()
75 |                     break
76 | 
77 |             # If no "Result" pod was found, use the first available result
78 |             if final_result == "No result found." and results:
79 |                 final_result = results[0]['result']
80 | 
81 | 
82 |             print(f"QUERY: {query}\n\nRESULT: {final_result}")
83 |             return final_result
84 | 
85 |         except Exception as e:
86 |             error_message = f"Error querying Wolfram Alpha: {str(e)}"
87 |             print(error_message)
88 |             return error_message
89 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
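
As a closing illustration, here is a minimal sketch of how the ranking pieces above compose: chunk raw page text, then rerank the chunks against a query. The page text and query are placeholders, and `JinaReranker` assumes `JINA_API_KEY` is set in the environment:

```python
from opendeepsearch.ranking_models.chunker import Chunker
from opendeepsearch.ranking_models.jina_reranker import JinaReranker

chunker = Chunker(chunk_size=150, chunk_overlap=50)
reranker = JinaReranker()  # or InfinitySemanticSearcher against a local Infinity server

page_text = "..."  # e.g. text produced by one of the context_scraping scrapers
chunks = chunker.split_text(page_text)

# Join the top-k most relevant chunks into a single context string
context = reranker.get_reranked_documents("your question here", chunks, top_k=5)
```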