├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── evals.png
│   └── sentient-logo-narrow.png
├── evals
│   ├── README.md
│   ├── autograde_df.py
│   ├── datasets
│   │   ├── frames_test_set.csv
│   │   └── simple_qa_test_set.csv
│   ├── eval_gpt_web.py
│   ├── eval_tasks.py
│   ├── gpt_web_extract.py
│   └── grader_prompts.py
├── gradio_demo.py
├── pdm.lock
├── pyproject.toml
├── requirements.txt
├── src
│   └── opendeepsearch
│       ├── __init__.py
│       ├── context_building
│       │   ├── build_context.py
│       │   └── process_sources_pro.py
│       ├── context_scraping
│       │   ├── basic_web_scraper.py
│       │   ├── crawl4ai_scraper.py
│       │   ├── extraction_result.py
│       │   ├── fast_scraper.py
│       │   ├── strategy_factory.py
│       │   └── utils.py
│       ├── ods_agent.py
│       ├── ods_tool.py
│       ├── prompts.py
│       ├── ranking_models
│       │   ├── README.md
│       │   ├── base_reranker.py
│       │   ├── chunker.py
│       │   ├── infinity_rerank.py
│       │   └── jina_reranker.py
│       ├── serp_search
│       │   └── serp_search.py
│       └── wolfram_tool.py
└── tests
    └── __init__.py
/.env.example:
--------------------------------------------------------------------------------
1 | # SEARXNG_INSTANCE_URL=http://searxng:8080
2 | # or
3 | # SERPER_API_KEY=
4 |
5 | JINA_API_KEY=
6 | WOLFRAM_ALPHA_APP_ID=
7 |
8 | ### Providers ###
9 | OPENAI_API_KEY=
10 | OPENAI_BASE_URL=
11 | ANTHROPIC_API_KEY=
12 | OPENROUTER_API_KEY=
13 |
14 | # LiteLLM model IDs for different tasks
15 | LITELLM_MODEL_ID=openrouter/google/gemini-2.0-flash-001
16 | LITELLM_SEARCH_MODEL_ID=openrouter/google/gemini-2.0-flash-001
17 | LITELLM_ORCHESTRATOR_MODEL_ID=openrouter/google/gemini-2.0-flash-001
18 | LITELLM_EVAL_MODEL_ID=gpt-4o-mini
19 | FIREWORKS_API_KEY=
20 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 | *.ipynb
9 | *.ipynb_checkpoints
10 | output/
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 | cover/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | .pybuilder/
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | # For a library or package, you might want to ignore these files since the code is
90 | # intended to run in multiple environments; otherwise, check them in:
91 | # .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # poetry
101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more
103 | # commonly ignored for libraries.
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 |
107 | # pdm
108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109 | #pdm.lock
110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111 | # in version control.
112 | # https://pdm-project.org/#use-with-ide
113 | .pdm.toml
114 | .pdm-python
115 | .pdm-build/
116 |
117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118 | __pypackages__/
119 |
120 | # Celery stuff
121 | celerybeat-schedule
122 | celerybeat.pid
123 |
124 | # SageMath parsed files
125 | *.sage.py
126 |
127 | # Environments
128 | .env
129 | .venv
130 | env/
131 | venv/
132 | ENV/
133 | env.bak/
134 | venv.bak/
135 |
136 | # Spyder project settings
137 | .spyderproject
138 | .spyproject
139 |
140 | # Rope project settings
141 | .ropeproject
142 |
143 | # mkdocs documentation
144 | /site
145 |
146 | # mypy
147 | .mypy_cache/
148 | .dmypy.json
149 | dmypy.json
150 |
151 | # Pyre type checker
152 | .pyre/
153 |
154 | # pytype static type analyzer
155 | .pytype/
156 |
157 | # Cython debug symbols
158 | cython_debug/
159 |
160 | # PyCharm
161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163 | # and can be added to the global gitignore or merged into this file. For a more nuclear
164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165 | #.idea/
166 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀
2 |
37 | ## Description 📝
38 |
39 | OpenDeepSearch is a lightweight yet powerful search tool designed for seamless integration with AI agents. It enables deep web search and retrieval, optimized for use with Hugging Face's **[SmolAgents](https://github.com/huggingface/smolagents)** ecosystem.
40 |
45 | - **Performance**: ODS performs on par with closed-source search alternatives on single-hop queries such as [SimpleQA](https://openai.com/index/introducing-simpleqa/) 🔍.
46 | - **Advanced Capabilities**: ODS performs substantially better than closed-source search alternatives on multi-hop queries such as [FRAMES bench](https://huggingface.co/datasets/google/frames-benchmark) 🚀.
47 |
48 | ## Table of Contents 📑
49 |
50 | - [🔍OpenDeepSearch: Democratizing Search with Open-source Reasoning Models and Reasoning Agents 🚀](#opendeepsearch-democratizing-search-with-open-source-reasoning-models-and-reasoning-agents-)
51 | - [Description 📝](#description-)
52 | - [Table of Contents 📑](#table-of-contents-)
53 | - [Features ✨](#features-)
54 | - [Installation 📚](#installation-)
55 | - [Setup](#setup)
56 |   - [Usage](#usage)
57 | - [Using OpenDeepSearch Standalone 🔍](#using-opendeepsearch-standalone-)
58 | - [Running the Gradio Demo 🖥️](#running-the-gradio-demo-️)
59 | - [Integrating with SmolAgents \& LiteLLM 🤖⚙️](#integrating-with-smolagents--litellm-️)
61 | - [ReAct agent with math and search tools 🤖⚙️](#react-agent-with-math-and-search-tools-️)
63 | - [Search Modes 🔄](#search-modes-)
64 | - [Default Mode ⚡](#default-mode-)
65 | - [Pro Mode 🔍](#pro-mode-)
66 | - [Acknowledgments 💡](#acknowledgments-)
67 | - [Citation](#citation)
68 | - [Contact 📩](#contact-)
69 |
70 | ## Features ✨
71 |
72 | - **Semantic Search** 🧠: Leverages **[Crawl4AI](https://github.com/unclecode/crawl4ai)** and semantic search rerankers (such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main) and [Jina AI](https://jina.ai/)) to provide in-depth results
73 | - **Two Modes of Operation** ⚡:
74 | - **Default Mode**: Quick and efficient search with minimal latency.
75 | - **Pro Mode (Deep Search)**: More in-depth and accurate results at the cost of additional processing time.
76 | - **Optimized for AI Agents** 🤖: Works seamlessly with **SmolAgents** like `CodeAgent`.
77 | - **Fast and Lightweight** ⚡: Designed for speed and efficiency with minimal setup.
78 | - **Extensible** 🔌: Easily configurable to work with different models and APIs.
79 |
80 | ## Installation 📚
81 |
82 | To install OpenDeepSearch, run:
83 |
84 | ```bash
85 | pip install -e . # you can also use: uv pip install -e .
86 | pip install -r requirements.txt # you can also use: uv pip install -r requirements.txt
87 | ```
88 |
89 | Note: `torch` must already be installed before running these commands.
90 | Note: using `uv` instead of regular `pip` makes installation significantly faster.
91 |
92 | ### Using PDM (Alternative Package Manager) 📦
93 |
94 | You can also use PDM as an alternative package manager for OpenDeepSearch. PDM is a modern Python package and dependency manager supporting the latest PEP standards.
95 |
96 | ```bash
97 | # Install PDM if you haven't already
98 | curl -sSL https://raw.githubusercontent.com/pdm-project/pdm/main/install-pdm.py | python3 -
99 |
100 | # Initialize a new PDM project
101 | pdm init
102 |
103 | # Install OpenDeepSearch and its dependencies
104 | pdm install
105 |
106 | # Activate the virtual environment
107 | eval "$(pdm venv activate)"
108 | ```
109 |
110 | PDM offers several advantages:
111 | - Lockfile support for reproducible installations
112 | - PEP 582 support (no virtual environment needed)
113 | - Fast dependency resolution
114 | - Built-in virtual environment management
115 |
116 | ## Setup
117 |
118 | 1. **Choose a Search Provider**:
119 |    - **Option 1: Serper.dev**: Sign up for **2,500 free credits** and add your API key.
120 | - Visit [serper.dev](https://serper.dev) to create an account.
121 | - Retrieve your API key and store it as an environment variable:
122 |
123 | ```bash
124 | export SERPER_API_KEY='your-api-key-here'
125 | ```
126 |
127 | - **Option 2: SearXNG**: Use a self-hosted or public SearXNG instance.
128 | - Specify the SearXNG instance URL when initializing OpenDeepSearch.
129 | - Optionally provide an API key if your instance requires authentication:
130 |
131 | ```bash
132 | export SEARXNG_INSTANCE_URL='https://your-searxng-instance.com'
133 | export SEARXNG_API_KEY='your-api-key-here' # Optional
134 | ```
135 |
136 | 2. **Choose a Reranking Solution**:
137 | - **Quick Start with Jina**: Sign up at [Jina AI](https://jina.ai/) to get an API key for immediate use
138 | - **Self-hosted Option**: Set up [Infinity Embeddings](https://github.com/michaelfeil/infinity) server locally with open source models such as [Qwen2-7B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-7B-instruct/tree/main)
139 | - For more details on reranking options, see our [Rerankers Guide](src/opendeepsearch/ranking_models/README.md)
140 |
141 | 3. **Set up LiteLLM Provider**:
142 | - Choose a provider from the [supported list](https://docs.litellm.ai/docs/providers/), including:
143 | - OpenAI
144 | - Anthropic
145 | - Google (Gemini)
146 | - OpenRouter
147 | - HuggingFace
148 | - Fireworks
149 | - And many more!
150 | - Set your chosen provider's API key as an environment variable:
151 | ```bash
152 | export <PROVIDER>_API_KEY='your-api-key-here' # e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY
153 | ```
154 | - For OpenAI, you can also set a custom base URL (useful for self-hosted endpoints or proxies):
155 | ```bash
156 | export OPENAI_BASE_URL='https://your-custom-openai-endpoint.com'
157 | ```
158 | - You can set default LiteLLM model IDs for different tasks:
159 | ```bash
160 | # General default model (fallback for all tasks)
161 | export LITELLM_MODEL_ID='openrouter/google/gemini-2.0-flash-001'
162 |
163 | # Task-specific models
164 | export LITELLM_SEARCH_MODEL_ID='openrouter/google/gemini-2.0-flash-001' # For search tasks
165 | export LITELLM_ORCHESTRATOR_MODEL_ID='openrouter/google/gemini-2.0-flash-001' # For agent orchestration
166 | export LITELLM_EVAL_MODEL_ID='gpt-4o-mini' # For evaluation tasks
167 | ```
168 | - When initializing OpenDeepSearch, you can specify your chosen model using the provider's format (this will override the environment variables):
169 | ```python
170 | search_agent = OpenDeepSearchTool(model_name="provider/model-name") # e.g., "anthropic/claude-3-opus-20240229", 'huggingface/microsoft/codebert-base', 'openrouter/google/gemini-2.0-flash-001'
171 | ```
172 |
173 | ## Usage
174 |
175 | You can use OpenDeepSearch independently or integrate it with **SmolAgents** for enhanced reasoning and code generation capabilities.
176 |
177 | ### Using OpenDeepSearch Standalone 🔍
178 |
179 | ```python
180 | from opendeepsearch import OpenDeepSearchTool
181 | import os
182 |
183 | # Set environment variables for API keys
184 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here" # If using Serper
185 | # Or for SearXNG
186 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com"
187 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here" # Optional
188 |
189 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here"
190 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
191 |
192 | # Using Serper (default)
193 | search_agent = OpenDeepSearchTool(
194 | model_name="openrouter/google/gemini-2.0-flash-001",
195 | reranker="jina"
196 | )
197 |
198 | # Or using SearXNG
199 | # search_agent = OpenDeepSearchTool(
200 | # model_name="openrouter/google/gemini-2.0-flash-001",
201 | # reranker="jina",
202 | # search_provider="searxng",
203 | # searxng_instance_url="https://your-searxng-instance.com",
204 | # searxng_api_key="your-api-key-here" # Optional
205 | # )
206 |
207 | if not search_agent.is_initialized:
208 | search_agent.setup()
209 |
210 | query = "Fastest land animal?"
211 | result = search_agent.forward(query)
212 | print(result)
213 | ```
214 |
215 | ### Running the Gradio Demo 🖥️
216 |
217 | To try out OpenDeepSearch with a user-friendly interface, simply run:
218 |
219 | ```bash
220 | python gradio_demo.py
221 | ```
222 |
223 | This will launch a local web interface where you can test different search queries and modes interactively.
224 |
225 | You can customize the demo with command-line arguments:
226 |
227 | ```bash
228 | # Using Serper (default)
229 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina"
230 |
231 | # Using SearXNG
232 | python gradio_demo.py --model-name "openrouter/google/gemini-2.0-flash-001" --reranker "jina" \
233 | --search-provider "searxng" --searxng-instance "https://your-searxng-instance.com" \
234 | --searxng-api-key "your-api-key-here" # Optional
235 | ```
236 |
237 | Available options:
238 | - `--model-name`: LLM model to use for search
239 | - `--orchestrator-model`: LLM model for the agent orchestrator
240 | - `--reranker`: Reranker to use (`jina` or `infinity`)
241 | - `--search-provider`: Search provider to use (`serper` or `searxng`)
242 | - `--searxng-instance`: SearXNG instance URL (required if using `searxng`)
243 | - `--searxng-api-key`: SearXNG API key (optional)
244 | - `--serper-api-key`: Serper API key (optional, will use environment variable if not provided)
245 | - `--openai-base-url`: OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided)
246 |
247 | ### Integrating with SmolAgents & LiteLLM 🤖⚙️
248 |
251 | ```python
252 | from opendeepsearch import OpenDeepSearchTool
253 | from smolagents import CodeAgent, LiteLLMModel
254 | import os
255 |
256 | # Set environment variables for API keys
257 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here" # If using Serper
258 | # Or for SearXNG
259 | # os.environ["SEARXNG_INSTANCE_URL"] = "https://your-searxng-instance.com"
260 | # os.environ["SEARXNG_API_KEY"] = "your-api-key-here" # Optional
261 |
262 | os.environ["OPENROUTER_API_KEY"] = "your-openrouter-api-key-here"
263 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
264 |
265 | # Using Serper (default)
266 | search_agent = OpenDeepSearchTool(
267 | model_name="openrouter/google/gemini-2.0-flash-001",
268 | reranker="jina"
269 | )
270 |
271 | # Or using SearXNG
272 | # search_agent = OpenDeepSearchTool(
273 | # model_name="openrouter/google/gemini-2.0-flash-001",
274 | # reranker="jina",
275 | # search_provider="searxng",
276 | # searxng_instance_url="https://your-searxng-instance.com",
277 | # searxng_api_key="your-api-key-here" # Optional
278 | # )
279 |
280 | model = LiteLLMModel(
281 | "openrouter/google/gemini-2.0-flash-001",
282 | temperature=0.2
283 | )
284 |
285 | code_agent = CodeAgent(tools=[search_agent], model=model)
286 | query = "How long would a cheetah at full speed take to run the length of Pont Alexandre III?"
287 | result = code_agent.run(query)
288 |
289 | print(result)
290 | ```
291 | ### ReAct agent with math and search tools 🤖⚙️
292 |
294 | ```python
295 | from opendeepsearch import OpenDeepSearchTool
296 | from opendeepsearch.wolfram_tool import WolframAlphaTool
297 | from opendeepsearch.prompts import REACT_PROMPT
298 | from smolagents import LiteLLMModel, ToolCallingAgent, Tool
299 | import os
300 |
301 | # Set environment variables for API keys
302 | os.environ["SERPER_API_KEY"] = "your-serper-api-key-here"
303 | os.environ["JINA_API_KEY"] = "your-jina-api-key-here"
304 | os.environ["WOLFRAM_ALPHA_APP_ID"] = "your-wolfram-alpha-app-id-here"
305 | os.environ["FIREWORKS_API_KEY"] = "your-fireworks-api-key-here"
306 |
307 | model = LiteLLMModel(
308 |     "fireworks_ai/llama-v3p1-70b-instruct", # Your Fireworks model ID
309 | temperature=0.7
310 | )
311 | search_agent = OpenDeepSearchTool(model_name="fireworks_ai/llama-v3p1-70b-instruct", reranker="jina") # Set reranker to "jina" or "infinity"
312 |
313 | # Initialize the Wolfram Alpha tool
314 | wolfram_tool = WolframAlphaTool(app_id=os.environ["WOLFRAM_ALPHA_APP_ID"])
315 |
316 | # Initialize the React Agent with search and wolfram tools
317 | react_agent = ToolCallingAgent(
318 | tools=[search_agent, wolfram_tool],
319 | model=model,
320 | prompt_templates=REACT_PROMPT # Using REACT_PROMPT as system prompt
321 | )
322 |
323 | # Example query for the React Agent
324 | query = "What is the distance, in metres, between the Colosseum in Rome and the Rialto bridge in Venice?"
325 | result = react_agent.run(query)
326 |
327 | print(result)
328 | ```
329 |
330 | ## Search Modes 🔄
331 |
332 | OpenDeepSearch offers two distinct search modes to balance speed against depth (a usage sketch follows the mode descriptions below):
333 |
334 | ### Default Mode ⚡
335 | - Uses SERP-based interaction for quick results
336 | - Minimal processing overhead
337 | - Ideal for single-hop, straightforward queries
338 | - Fast response times
339 | - Perfect for basic information retrieval
340 |
341 | ### Pro Mode 🔍
342 | - Involves comprehensive web scraping
343 | - Implements semantic reranking of results
344 | - Includes advanced post-processing of data
345 | - Slightly longer processing time
346 | - Excels at:
347 | - Multi-hop queries
348 | - Complex search requirements
349 | - Detailed information gathering
350 | - Questions requiring cross-reference verification
351 |
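Pro Mode is selected at query time rather than at install time. The snippet below is a sketch only: the `OpenDeepSearchAgent` constructor arguments and the `ask_sync`/`pro_mode` names are assumptions inferred from the repository layout (`src/opendeepsearch/ods_agent.py`), so check that file for the exact signature in your checkout.

```python
from opendeepsearch.ods_agent import OpenDeepSearchAgent  # module path per the tree above

# NOTE: the constructor arguments and the `pro_mode` keyword are assumptions;
# verify against src/opendeepsearch/ods_agent.py before relying on them.
agent = OpenDeepSearchAgent(
    model="openrouter/google/gemini-2.0-flash-001",
    reranker="jina",
)

# Default Mode: answer directly from SERP results with minimal latency.
print(agent.ask_sync("Fastest land animal?", pro_mode=False))

# Pro Mode: scrape the sources, chunk them, and semantically rerank
# before answering (slower, better on multi-hop questions).
print(agent.ask_sync("Fastest land animal?", pro_mode=True))
```
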
352 | ## Acknowledgments 💡
353 |
354 | OpenDeepSearch is built on the shoulders of great open-source projects:
355 |
356 | - **[SmolAgents](https://huggingface.co/docs/smolagents/index)** 🤗 – Powers the agent framework and reasoning capabilities.
357 | - **[Crawl4AI](https://github.com/unclecode/crawl4ai)** 🕷️ – Provides data crawling support.
358 | - **[Infinity Embedding API](https://github.com/michaelfeil/infinity)** 🌍 – Powers semantic search capabilities.
359 | - **[LiteLLM](https://www.litellm.ai/)** 🔥 – Used for efficient AI model integration.
360 | - **Various Open-Source Libraries** 📚 – Enhancing search and retrieval functionalities.
361 |
362 | ## Citation
363 |
364 | If you use `OpenDeepSearch` in your work, please cite it using the following BibTeX entry:
365 |
366 | ```
367 | @misc{alzubi2025opendeepsearchdemocratizing,
368 | title={Open Deep Search: Democratizing Search with Open-source Reasoning Agents},
369 | author={Salaheddin Alzubi and Creston Brooks and Purva Chiniya and Edoardo Contente and Chiara von Gerlach and Lucas Irwin and Yihan Jiang and Arda Kaz and Windsor Nguyen and Sewoong Oh and Himanshu Tyagi and Pramod Viswanath},
370 | year={2025},
371 | eprint={2503.20201},
372 | archivePrefix={arXiv},
373 | primaryClass={cs.LG},
374 | url={https://arxiv.org/abs/2503.20201},
375 | }
376 | ```
377 |
378 |
379 | ## Contact 📩
380 |
381 | For questions or collaborations, open an issue or reach out to the maintainers.
382 |
--------------------------------------------------------------------------------
/assets/evals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/evals.png
--------------------------------------------------------------------------------
/assets/sentient-logo-narrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-agi/OpenDeepSearch/HEAD/assets/sentient-logo-narrow.png
--------------------------------------------------------------------------------
/evals/README.md:
--------------------------------------------------------------------------------
1 | # Evaluation Scripts
2 |
3 | This repository contains scripts for running evaluations and autograding on model outputs.
4 |
5 | ## Available Commands
6 |
7 | ### Autograde DataFrame Evaluation
8 | To autograde a JSONL results file (as produced by `eval_tasks.py`):
9 |
10 | ```bash
11 | python evals/autograde_df.py <path-to-results.jsonl> --num_cpus 4
12 | ```
13 |
14 | Example:
15 |
16 | ```bash
17 | python evals/autograde_df.py output/fireworks_ai__accounts__fireworks__models__qwq-32b/codeact/simple_qa_test_set/fireworks_ai__accounts__fireworks__models__qwq-32b__codeact__simple_qa_test_set__trial1.jsonl
18 | ```
19 |
20 | This command loads the specified JSONL file, grades each predicted answer against the gold answer with an LLM grader, and writes the results back to the same file.
21 |
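For reference, `autograde_df.py` loads the file with `pd.read_json(..., lines=True)` and expects `original_question`, `answer`, and `true_answer` fields on every line; grades are written back as a new `final_grade` column. A minimal sketch of a compatible input file (the file name is just an example):

```python
import json

# Each line must carry the three fields autograde_df.py reads;
# any extra fields (e.g. model_id, token_counts) pass through untouched.
rows = [
    {
        "original_question": "Fastest land animal?",
        "answer": "The cheetah",
        "true_answer": "Cheetah",
    },
]
with open("my_results.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```
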
22 | ### Run Task Evaluations
23 | To run evaluations on a dataset with parallel processing:
24 |
25 | ```bash
26 | python ./evals/eval_tasks.py --parallel-workers 8 --num-trials 1 --eval-tasks ./evals/datasets/frames_test_set.csv ./evals/datasets/simple_qa_test_set.csv
27 | ```
28 |
29 | Parameters:
30 | - `--date`: Optional date for the evaluation
31 | - `--eval-tasks`: List of paths to CSV files containing evaluation tasks (default: ["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"])
32 | - `--search-model-id`: Model ID for the search tool (default: "fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct")
33 | - `--model-type`: Type of model to use, either "LiteLLMModel" or "HfApiModel" (default: "LiteLLMModel")
34 | - `--model-id`: ID of the model to use (default: "fireworks_ai/accounts/fireworks/models/qwq-32b")
35 | - `--agent-action-type`: Type of agent action: "codeact", "tool-calling", or "vanilla" (default: "codeact")
36 | - `--parallel-workers`: Number of parallel workers to use (default: 8)
37 | - `--num-trials`: Number of evaluation trials to run (default: 1)
38 |
39 | Results are written as JSONL files under the `output/` directory, one line per question.
40 |
41 | ## Output
42 | Evaluation results are stored in the following locations:
43 | - Task evaluation results: JSONL files under `output/<model_id>/<agent_action_type>/<task>/`
44 | - Autograding results: written back into the graded JSONL file as a `final_grade` column
45 |
46 |
47 |
--------------------------------------------------------------------------------
/evals/autograde_df.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import litellm
3 | import argparse
4 | from evals.grader_prompts import GRADER_TEMPLATE
5 | from multiprocessing import Pool, cpu_count
6 | from tqdm import tqdm
7 |
8 | def grade_row(row_data):
9 | idx, row = row_data
10 | question = row['original_question']
11 | predicted_answer = row['answer']
12 | gold_answer = row['true_answer']
13 |
14 | input_prompt = GRADER_TEMPLATE.format(
15 | question=question,
16 | predicted_answer=predicted_answer,
17 | target=gold_answer
18 | )
19 |
20 | try:
21 | output = litellm.completion(
22 | model="openrouter/google/gemini-2.0-flash-001",
23 | messages=[{"role": "user", "content": input_prompt}],
24 | temperature=0.0
25 | )['choices'][0]['message']['content']
26 | return idx, output
27 | except Exception as e:
28 | print(f"Error processing row {idx}: {e}")
29 | return idx, "Error"
30 |
31 | def autograde_df(df_path, num_cpus=4):
32 | # Read the dataframe
33 | df = pd.read_json(df_path, lines=True)
34 |
35 | # Prepare data for parallel processing
36 | row_data = list(df.iterrows())
37 |
38 | # Use specified number of CPU cores
39 | n_processes = max(1, min(num_cpus, cpu_count()))
40 | print(f"Using {n_processes} processes")
41 |
42 | # Create process pool and process rows in parallel
43 | with Pool(n_processes) as pool:
44 | # Use tqdm for progress bar
45 | results = list(tqdm(
46 | pool.imap(grade_row, row_data),
47 | total=len(row_data),
48 | desc="Grading"
49 | ))
50 |
51 | # Sort results by index and extract grades
52 | results.sort(key=lambda x: x[0])
53 | final_grades = [grade for _, grade in results]
54 |
55 | # Add the grades as a new column
56 | df['final_grade'] = final_grades
57 |
58 | # Save the updated dataframe back to the same file
59 | df.to_json(df_path, orient='records', lines=True)
60 | print("Grading completed and results saved!")
61 |
62 | if __name__ == "__main__":
63 | parser = argparse.ArgumentParser(description='Auto-grade answers in a DataFrame')
64 | parser.add_argument('df_path', type=str, help='Path to the DataFrame JSON file')
65 | parser.add_argument('--num_cpus', type=int, default=4, help='Number of CPU cores to use')
66 |
67 | args = parser.parse_args()
68 | autograde_df(args.df_path, args.num_cpus)
69 |
--------------------------------------------------------------------------------
/evals/eval_gpt_web.py:
--------------------------------------------------------------------------------
2 | from openai import OpenAI
3 | import time
4 | from typing import List, Dict, Any
5 | import json
6 | import pandas as pd
7 | from pathlib import Path
8 | import argparse
9 | from dotenv import load_dotenv
10 | import os
11 | from tqdm import tqdm
14 | from concurrent.futures import ProcessPoolExecutor
15 |
16 | load_dotenv()
17 |
18 | class WebSearchEvaluator:
19 | def __init__(self, model: str, output_path: Path, num_workers: int = 4, trial: int = 0):
20 | self.model = model
21 | self.output_path = output_path
22 | self.num_workers = num_workers
23 | self.trial = trial
24 |
25 | # Load existing results if any
26 | self.processed_questions = set()
27 | if self.output_path.exists():
28 | with open(self.output_path, 'r') as f:
29 | for line in f:
30 | try:
31 | result = json.loads(line)
32 | self.processed_questions.add(result['question'])
33 |                 except (json.JSONDecodeError, KeyError):
34 | continue
35 |
36 | def worker_init(self):
37 | """Initialize OpenAI client for each worker."""
38 | # Create new client for each process
39 | self.client = OpenAI(
40 | api_key=os.environ.get("OPENAI_API_KEY"),
41 | base_url=os.environ.get("OPENAI_BASE_URL")
42 | )
43 |
44 | def evaluate_single(self, row: pd.Series) -> Dict[str, Any]:
45 | """Evaluate a single question with its true answer."""
46 | # Skip if already processed
47 | if row['question'] in self.processed_questions:
48 | return None
49 |
50 | if not hasattr(self, 'client'):
51 | self.worker_init()
52 |
53 | try:
54 | start_time = time.time()
55 | response = self.client.responses.create(
56 | model=self.model,
57 | tools=[{"type": "web_search_preview"}],
58 | input=row['question']
59 | )
60 | end_time = time.time()
61 | result = {
62 | "question": row['question'],
63 | "true_answer": row['true_answer'],
64 | "answer": response.output_text,
65 | "model": self.model,
66 | "time_taken": end_time - start_time,
67 | "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
68 | }
69 | return result
70 | except Exception as e:
71 | return {
72 | "question": row['question'],
73 | "true_answer": row['true_answer'],
74 | "answer": None,
75 | "error": str(e),
76 | "model": self.model,
77 | "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
78 | }
79 |
80 | def save_result(self, result: Dict[str, Any]) -> None:
81 | """Save a single result to the JSONL file."""
82 | with open(self.output_path, 'a') as f:
83 | f.write(json.dumps(result) + '\n')
84 |
85 | def evaluate_batch(self, df: pd.DataFrame) -> None:
86 | """Evaluate questions in parallel using multiple workers."""
87 | with ProcessPoolExecutor(
88 | max_workers=self.num_workers,
89 | initializer=self.worker_init
90 | ) as executor:
91 | # Convert DataFrame rows to list of Series
92 | rows = [row for _, row in df.iterrows()]
93 |
94 | # Create progress bar for total rows
95 | with tqdm(total=len(rows), desc="Processing questions") as pbar:
96 | # Submit all tasks
97 | futures = [executor.submit(self.evaluate_single, row) for row in rows]
98 |
99 | # Process results as they complete
100 | for future in futures:
101 | result = future.result()
102 | if result is not None: # Only save if not already processed
103 | self.save_result(result)
104 | pbar.update(1)
105 |
106 | def parse_args():
107 |     parser = argparse.ArgumentParser(description='Evaluate questions using an OpenAI model with web search')
108 | parser.add_argument('--output_dir', type=str, default='output',
109 | help='Directory to save results (default: output)')
110 | parser.add_argument('--input_data', type=str,
111 | default='./evals/datasets/frames_test_set.csv',
112 | help='Path to input CSV file')
113 | parser.add_argument('--model', type=str,
114 | default=os.getenv("LITELLM_EVAL_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "gpt-4o-mini")),
115 | help='Model to use for evaluation')
116 | parser.add_argument('--num_workers', type=int, default=4,
117 | help='Number of parallel workers (default: 4)')
118 | parser.add_argument('--trial', type=int, default=0,
119 | help='Trial number for this evaluation run (default: 0)')
120 | return parser.parse_args()
121 |
122 | def main():
123 | args = parse_args()
124 |
125 | # Create output directory if it doesn't exist
126 | output_dir = Path(args.output_dir)
127 | output_dir.mkdir(parents=True, exist_ok=True)
128 |
129 |     # Set up output path
130 | output_path = output_dir / f"evaluation_results_{args.model}_trial{args.trial}.jsonl"
131 |
132 | # Load input data
133 | print(f"Loading data from {args.input_data}")
134 | df = pd.read_csv(args.input_data)
135 | print(f"Loaded {len(df)} examples")
136 |
137 | # Initialize evaluator
138 | evaluator = WebSearchEvaluator(
139 | model=args.model,
140 | output_path=output_path,
141 | num_workers=args.num_workers,
142 | trial=args.trial
143 | )
144 |
145 | # Run evaluation
146 | print(f"Starting evaluation with model {args.model} using {args.num_workers} workers...")
147 | evaluator.evaluate_batch(df)
148 | print(f"Results saved to {output_path}")
149 |
150 | # Load and display summary
151 | results_df = pd.read_json(output_path, lines=True)
152 | print("\nResults summary:")
153 | print(f"Model: {args.model}")
154 | print(f"Total evaluations: {len(results_df)}")
155 | print(f"Successful evaluations: {len(results_df[~results_df['answer'].isna()])}")
156 | print(f"Failed evaluations: {len(results_df[results_df['answer'].isna()])}")
157 |
158 | if __name__ == "__main__":
159 | main()
160 |
--------------------------------------------------------------------------------
/evals/eval_tasks.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import datetime
3 | import json
4 | import os
5 | import threading
6 | import time
7 | from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed
8 | from pathlib import Path
9 |
10 | import datasets
11 | import pandas as pd
12 | from datasets import Dataset
13 | from dotenv import load_dotenv
14 | from tqdm import tqdm
15 | from opendeepsearch import OpenDeepSearchTool
16 |
17 | from smolagents import (
18 | AgentError,
19 | CodeAgent,
20 | LiteLLMModel,
21 | HfApiModel,
22 | PythonInterpreterTool,
23 | ToolCallingAgent,
24 | )
25 | from smolagents.agents import ActionStep
26 |
27 |
28 | load_dotenv()
29 |
30 | APPEND_ANSWER_LOCK = threading.Lock()
31 |
32 |
33 | def parse_arguments():
34 | parser = argparse.ArgumentParser(description="Runs an agent powered by the given model on smolagent benchmark.")
35 | parser.add_argument(
36 | "--date",
37 | type=str,
38 | default=None,
39 | help="The date for the evaluation.",
40 | )
41 | parser.add_argument(
42 | "--eval-tasks",
43 | type=str,
44 | nargs="+",
45 | default=["./evals/datasets/frames_test_set.csv", "./evals/datasets/simple_qa_test_set.csv"],
46 | help="List of evaluation task paths",
47 | )
48 | parser.add_argument(
49 | "--search-model-id",
50 | type=str,
51 | default="fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct",
52 |         help="The model ID to use for the search tool",
53 | )
54 | parser.add_argument(
55 | "--model-type",
56 | type=str,
57 | default="LiteLLMModel",
58 | choices=["LiteLLMModel", "HfApiModel"],
59 | help="The model type to use (LiteLLMModel or HfApiModel)",
60 | )
61 | parser.add_argument(
62 | "--model-id",
63 | type=str,
64 | default="fireworks_ai/accounts/fireworks/models/qwq-32b",
65 | help="The model ID to use for the specified model type",
66 | )
67 | parser.add_argument(
68 | "--agent-action-type",
69 | type=str,
70 | default="codeact",
71 | choices=["codeact", "tool-calling", "vanilla"],
72 | help="The agent action type: 'codeact', 'tool-calling', or 'vanilla' to use the vanilla llm",
73 | )
74 | parser.add_argument(
75 | "--parallel-workers",
76 | type=int,
77 | default=8,
78 | help="The number of processes to run in parallel",
79 | )
80 | parser.add_argument(
81 | "--num-trials",
82 | type=int,
83 | default=1,
84 | help="Number of trials to run for each evaluation",
85 | )
86 | return parser.parse_args()
87 |
88 |
89 | def load_eval_dataset(eval_tasks: list):
90 | eval_ds = {}
91 | for task_path in eval_tasks:
92 | task_name = task_path.split("/")[-1][:-4]
93 | df = pd.read_csv(task_path)
94 | dataset = Dataset.from_pandas(df)
95 | eval_ds[task_name] = dataset
96 | return eval_ds
97 |
98 |
99 | def serialize_agent_error(obj):
100 | if isinstance(obj, AgentError):
101 | return {"error_type": obj.__class__.__name__, "message": obj.message}
102 | else:
103 | return str(obj)
104 |
105 |
106 | def append_answer(entry: dict, jsonl_file: str) -> None:
107 | jsonl_file = Path(jsonl_file)
108 | jsonl_file.parent.mkdir(parents=True, exist_ok=True)
109 | with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
110 | fp.write(json.dumps(entry) + "\n")
111 | assert os.path.exists(jsonl_file), "File not found!"
112 |
113 |
114 | def run_with_timeout(func, timeout):
115 | with ThreadPoolExecutor(max_workers=1) as executor:
116 | future = executor.submit(func)
117 | try:
118 | return future.result(timeout=timeout)
119 |         except FuturesTimeoutError:
120 | return "Timed Out"
121 |
122 |
123 | def answer_single_question(example, model, answers_file, action_type, search_model_id=None):
124 | if action_type == "vanilla":
125 | agent = model
126 | elif action_type == "codeact":
127 | agent = CodeAgent(
128 | tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id)],
129 | model=model,
130 | additional_authorized_imports=["numpy"],
131 | max_steps=15,
132 | )
133 | elif action_type == "tool-calling":
134 | agent = ToolCallingAgent(
135 | tools=[OpenDeepSearchTool(model_name=search_model_id or model.model_id), PythonInterpreterTool()],
136 | model=model,
137 | additional_authorized_imports=["numpy"],
138 | max_steps=15,
139 | )
140 |
141 | augmented_question = example["question"]
142 | start_time = time.time()
143 | TIMEOUT_SECONDS = 300 # 5 minutes timeout
144 |
145 | try:
146 | if action_type == "vanilla":
147 | def get_vanilla_response():
148 | response = agent([{"role": "user", "content": augmented_question}])
149 | return response.content, agent.last_output_token_count
150 |
151 | answer, token_count = run_with_timeout(get_vanilla_response, TIMEOUT_SECONDS)
152 | intermediate_steps = answer
153 | else:
154 | def get_agent_response():
155 | response = str(agent.run(augmented_question))
156 | token_count = agent.monitor.get_total_token_counts()
157 | # Remove memory from logs to make them more compact.
158 | for step in agent.memory.steps:
159 | if isinstance(step, ActionStep):
160 | step.agent_memory = None
161 | return response, token_count, str(agent.memory.steps)
162 |
163 | answer, token_count, intermediate_steps = run_with_timeout(get_agent_response, TIMEOUT_SECONDS)
164 |
165 | end_time = time.time()
166 |     except Exception as e:
167 |         print("Error on ", augmented_question, e)
168 |         answer, token_count, intermediate_steps = f"Error: {e}", None, []
169 |         end_time = time.time()
170 | annotated_example = {
171 | "model_id": model.model_id,
172 | "agent_action_type": action_type,
173 | "original_question": example["question"],
174 | "answer": answer,
175 | "true_answer": example["true_answer"],
176 | "intermediate_steps": intermediate_steps,
177 | "start_time": start_time,
178 | "end_time": end_time,
179 | "token_counts": token_count,
180 | }
181 | append_answer(annotated_example, answers_file)
182 |
183 |
184 | def answer_questions(
185 | eval_ds,
186 | model,
187 | date,
188 | action_type: str = "codeact",
189 | output_dir: str = "output",
190 | parallel_workers: int = 32,
191 | search_model_id: str = None,
192 | num_trials: int = 1,
193 | ):
194 | date = date or datetime.date.today().isoformat()
195 | model_id = model.model_id
196 |
197 | # Create directory structure: output/model_id/action_type/task
198 | model_dir = model_id.replace('/', '__')
199 |
200 | for task in eval_ds:
201 | task_dir = os.path.join(output_dir, model_dir, action_type, task)
202 | os.makedirs(task_dir, exist_ok=True)
203 |
204 | for trial in range(num_trials):
205 | file_name = f"{task_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__trial{trial}.jsonl"
206 | print(f"Starting processing trial {trial + 1}/{num_trials} and writing output to '{file_name}'")
207 | answered_questions = []
208 | if os.path.exists(file_name):
209 | with open(file_name, "r") as f:
210 | for line in f:
211 | answered_questions.append(json.loads(line)["original_question"])
212 | examples_todo = [example for example in eval_ds[task] if example["question"] not in answered_questions]
213 | print(f"Launching {parallel_workers} parallel workers.")
214 |
215 | with ThreadPoolExecutor(max_workers=parallel_workers) as exe:
216 | futures = [
217 | exe.submit(answer_single_question, example, model, file_name, action_type, search_model_id)
218 | for example in examples_todo
219 | ]
220 | for f in tqdm(as_completed(futures), total=len(examples_todo), desc="Processing tasks"):
221 | f.result()
222 |
223 | print("All tasks processed.")
224 |
225 |
226 | if __name__ == "__main__":
227 | args = parse_arguments()
228 |
229 | eval_ds = load_eval_dataset(args.eval_tasks)
230 |
231 | if args.model_type == "LiteLLMModel":
232 | model = LiteLLMModel(
233 | args.model_id,
234 | max_completion_tokens=8192,
235 | temperature=0.2,
236 | # api_key=os.getenv("OPENROUTER_API_KEY"),
237 | )
238 | else:
239 | model = HfApiModel(args.model_id, provider="together", max_tokens=8192)
240 |
241 | answer_questions(
242 | eval_ds,
243 | model,
244 | args.date,
245 | action_type=args.agent_action_type,
246 | parallel_workers=args.parallel_workers,
247 | search_model_id=args.search_model_id,
248 | num_trials=args.num_trials,
249 | )
--------------------------------------------------------------------------------
/evals/gpt_web_extract.py:
--------------------------------------------------------------------------------
1 | import litellm
2 | from multiprocessing import Pool
3 | import pandas as pd
4 | from tqdm import tqdm
5 | import argparse
6 |
7 | input_prompt = """You are a precise answer extractor. Your job is to read a question and a detailed answer, then output ONLY the final answer without any explanation.
8 |
9 | For example:
10 | Question: "What is 2+2?"
11 | Detailed Answer: "Let me calculate this. 2 plus 2 equals 4, which is a basic mathematical fact."
12 | Final Answer: 4
13 |
14 | Question: "What color is the sky on a clear day?"
15 | Detailed Answer: "When we look up on a clear day, the sky appears blue due to a phenomenon called Rayleigh scattering."
16 | Final Answer: blue
17 |
18 | Question: "If my future wife has the same first name as the 15th first lady of the United States' mother and her surname is the same as the second assassinated president's mother's maiden name, what is my future wife's name?"
19 | Detailed Answer: "The 15th First Lady of the United States was Ellen Wilson, and her mother's name was Hannah. The second assassinated president was Abraham Lincoln, and his mother's maiden name was Hodge. \n\nPutting that together, your future wife's name is **Hannah Hodge**."
20 | Final Answer: Hannah Hodge
21 |
22 | Now do this:
23 | Question: {question}
24 | Detailed Answer: {detailed_answer}
25 | Final Answer:"""
26 |
27 | def process_row(row):
28 | """Process a single row using litellm."""
29 | try:
30 | output = litellm.completion(
31 | model="openrouter/google/gemini-2.0-flash-001",
32 | messages=[{
33 | "role": "user",
34 | "content": input_prompt.format(
35 | question=row['question'],
36 | detailed_answer=row['original_answer']
37 | )
38 | }],
39 | temperature=0.3
40 | )
41 | return output['choices'][0]['message']['content']
42 | except Exception as e:
43 | print(f"Error processing row: {e}")
44 | return None
45 |
46 | def process_dataframe(df, num_workers=4):
47 | """Process the entire dataframe using a pool of workers."""
48 | with Pool(num_workers) as pool:
49 | # Use tqdm to show progress bar
50 | results = list(tqdm(
51 | pool.imap(process_row, [row for _, row in df.iterrows()]),
52 | total=len(df)
53 | ))
54 |
55 | # Add results as a new column
56 | df['processed_output'] = results
57 | return df
58 |
59 | if __name__ == '__main__':
60 |     parser = argparse.ArgumentParser(description='Extract final answers from a JSONL results file using litellm in parallel')
61 |     parser.add_argument('input_file', type=str, help='Path to the input JSONL file')
62 | parser.add_argument('--workers', type=int, default=4, help='Number of worker processes (default: 4)')
63 |
64 | args = parser.parse_args()
65 |
66 | # Load and process the dataframe
67 | df = pd.read_json(args.input_file, lines=True)
68 |
69 | # Rename 'answer' to 'original_answer'
70 | df = df.rename(columns={'answer': 'original_answer'})
71 |
72 | # Process the dataframe and store results in 'answer' column
73 | processed_df = process_dataframe(df, num_workers=args.workers)
74 | processed_df = processed_df.rename(columns={'processed_output': 'answer'})
75 |
76 |     # Save as CSV, appending '_processed' to the input file's base name
77 |     output_file = args.input_file.rsplit('.', 1)[0] + '_processed.csv'
78 | processed_df.to_csv(output_file, index=False)
79 | print(f"Processed data saved to: {output_file}")
80 |
--------------------------------------------------------------------------------
/evals/grader_prompts.py:
--------------------------------------------------------------------------------
1 | GRADER_TEMPLATE = """
2 | Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
3 | First, I will give examples of each grade, and then you will grade a new example.
4 |
5 |
6 | The following are examples of CORRECT predicted answers.
7 | ```
8 | Question: What are the names of Barack Obama's children?
9 | Gold target: Malia Obama and Sasha Obama
10 | Predicted answer 1: sasha and malia obama
11 | Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
12 | Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
13 | ```
14 | These predicted answers are all CORRECT because:
15 | - They fully contain the important information in the gold target.
16 | - They do not contain any information that contradicts the gold target.
17 | - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
18 | - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
19 |
20 |
21 | The following are examples of INCORRECT predicted answers.
22 | ```
23 | Question: What are the names of Barack Obama's children?
24 | Gold target: Malia and Sasha
25 | Predicted answer 1: Malia.
26 | Predicted answer 2: Malia, Sasha, and Susan.
27 | Predicted answer 3: Barack Obama does not have any children.
28 | Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
29 | Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children.
30 | Predicted answer 6: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
31 | Predicted answer 7: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
32 | ```
33 | These predicted answers are all INCORRECT because:
34 | - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
35 |
36 |
37 | The following are examples of NOT_ATTEMPTED predicted answers.
38 | ```
39 | Question: What are the names of Barack Obama's children?
40 | Gold target: Malia and Sasha
41 | Predicted answer 1: I don't know.
42 | Predicted answer 2: I need more context about which Obama you are talking about.
43 | Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
44 | Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
45 | ```
46 | These predicted answers are all NOT_ATTEMPTED because:
47 | - The important information in the gold target is not included in the answer.
48 | - No statements in the answer contradict the gold target.
49 |
50 |
51 | Also note the following things:
52 | - For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
53 | - Predicted answers "120k", "124k", and "115k" are all CORRECT.
54 | - Predicted answers "100k" and "113k" are INCORRECT.
55 | - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
56 | - The presence or absence of commas in numbers (e.g., "5,876" vs "5876") does not affect grading.
57 | - Numbers written as words or digits are equivalent (e.g., "2 million" vs "2000000" vs "2,000,000" are all considered the same).
58 | - For large numerical answers, a margin of error of ±1% is acceptable (e.g., if the gold answer is 855, predicted answers between 846.45 and 863.55 are CORRECT).
59 | - The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
60 | - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
61 | - Do not punish predicted answers if they omit information that would be clearly inferred from the question.
62 | - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
63 | - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
64 | - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
65 | - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
66 | - Do not punish for typos in people's name if it's clearly the same name.
67 | - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
68 |
69 |
70 | Here is a new example. Simply reply with CORRECT, INCORRECT, or NOT_ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
71 | ```
72 | Question: {question}
73 | Gold target: {target}
74 | Predicted answer: {predicted_answer}
75 | ```
76 |
77 | Grade the predicted answer of this new question as one of:
78 | A: CORRECT
79 | B: INCORRECT
80 | C: NOT_ATTEMPTED
81 |
82 | Just return the letter "A", "B", or "C", with no text around it.
83 | """.strip()
84 |
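
For reference, a minimal sketch of how this template might be filled in and scored through LiteLLM; the model ID, import path, and letter-to-label mapping are assumptions rather than part of the evaluation code.

```
# Hypothetical grading helper built around GRADER_TEMPLATE.
from litellm import completion

from grader_prompts import GRADER_TEMPLATE  # assumes running from the evals/ directory

def grade(question: str, target: str, predicted_answer: str, model: str = "gpt-4o-mini") -> str:
    prompt = GRADER_TEMPLATE.format(
        question=question, target=target, predicted_answer=predicted_answer
    )
    reply = completion(model=model, messages=[{"role": "user", "content": prompt}])
    letter = reply["choices"][0]["message"]["content"].strip()
    # Map the single-letter grade back to a label; unparsable replies count as NOT_ATTEMPTED.
    return {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}.get(letter, "NOT_ATTEMPTED")
```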
--------------------------------------------------------------------------------
/gradio_demo.py:
--------------------------------------------------------------------------------
1 | from smolagents import CodeAgent, GradioUI, LiteLLMModel
2 | from opendeepsearch import OpenDeepSearchTool
3 | import os
4 | from dotenv import load_dotenv
5 | import argparse
6 |
7 | # Load environment variables
8 | load_dotenv()
9 |
10 | # Add command line argument parsing
11 | parser = argparse.ArgumentParser(description='Run the Gradio demo with custom models')
12 | parser.add_argument('--model-name',
13 | default=os.getenv("LITELLM_SEARCH_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
14 | help='Model name for search')
15 | parser.add_argument('--orchestrator-model',
16 | default=os.getenv("LITELLM_ORCHESTRATOR_MODEL_ID", os.getenv("LITELLM_MODEL_ID", "openrouter/google/gemini-2.0-flash-001")),
17 | help='Model name for orchestration')
18 | parser.add_argument('--reranker',
19 | choices=['jina', 'infinity'],
20 | default='jina',
21 | help='Reranker to use (jina or infinity)')
22 | parser.add_argument('--search-provider',
23 | choices=['serper', 'searxng'],
24 | default='serper',
25 | help='Search provider to use (serper or searxng)')
26 | parser.add_argument('--searxng-instance',
27 | help='SearXNG instance URL (required if search-provider is searxng)')
28 | parser.add_argument('--searxng-api-key',
29 | help='SearXNG API key (optional)')
30 | parser.add_argument('--serper-api-key',
31 | help='Serper API key (optional, will use SERPER_API_KEY env var if not provided)')
32 | parser.add_argument('--openai-base-url',
33 | help='OpenAI API base URL (optional, will use OPENAI_BASE_URL env var if not provided)')
34 | parser.add_argument('--server-port',
35 | type=int,
36 | default=7860,
37 | help='Port to run the Gradio server on')
38 |
39 | args = parser.parse_args()
40 |
41 | # Validate arguments
42 | if args.search_provider == 'searxng' and not (args.searxng_instance or os.getenv('SEARXNG_INSTANCE_URL')):
43 | parser.error("--searxng-instance is required when using --search-provider=searxng")
44 |
45 | # Set OpenAI base URL if provided via command line
46 | if args.openai_base_url:
47 | os.environ["OPENAI_BASE_URL"] = args.openai_base_url
48 |
49 | # Use the command line arguments
50 | search_tool = OpenDeepSearchTool(
51 | model_name=args.model_name,
52 | reranker=args.reranker,
53 | search_provider=args.search_provider,
54 | serper_api_key=args.serper_api_key,
55 | searxng_instance_url=args.searxng_instance,
56 | searxng_api_key=args.searxng_api_key
57 | )
58 | model = LiteLLMModel(
59 | model_id=args.orchestrator_model,
60 | temperature=0.2,
61 | )
62 |
63 | # Initialize the agent with the search tool
64 | agent = CodeAgent(tools=[search_tool], model=model)
65 |
66 | # Launch the Gradio UI for the agent
67 | GradioUI(agent).launch(server_name="127.0.0.1", server_port=args.server_port, share=False)
68 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "OpenDeepSearch"
3 | version = "0.1.0"
4 | description = "Open-source search agent with web search, scraping, and semantic reranking"
5 | authors = [
6 | {name = "Salaheddin Alzu'bi", email = "salaheddinalzubi@gmail.com"},
7 | ]
8 |
9 | dependencies = ["openai>=1.66.2", "datasets>=3.3.2", "transformers>=4.49.0", "litellm>=1.61.20", "langchain>=0.3.19", "crawl4ai @ git+https://github.com/salzubi401/crawl4ai.git@main", "fasttext-wheel>=0.9.2", "wikipedia-api>=0.8.1", "pillow>=10.4.0", "smolagents>=1.9.2", "gradio==5.20.1"]
10 | requires-python = ">=3.10"
11 | readme = "README.md"
12 | license = {text = "MIT"}
13 |
14 | [build-system]
15 | requires = ["hatchling"]
16 | build-backend = "hatchling.build"
17 |
18 |
19 | [tool.pdm]
20 | distribution = true
21 |
22 | [tool.hatch.metadata]
23 | allow-direct-references = true
24 |
25 | [tool.uv]
26 | python = "3.10"
27 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai>=1.66.2
2 | datasets>=3.3.2
3 | transformers>=4.49.0
4 | litellm>=1.61.20
5 | langchain>=0.3.19
6 | git+https://github.com/salzubi401/crawl4ai.git@main
7 | fasttext-wheel>=0.9.2
8 | wikipedia-api>=0.8.1
9 | pillow>=10.4.0
10 | smolagents>=1.9.2
11 | gradio==5.20.1
12 |
13 |
--------------------------------------------------------------------------------
/src/opendeepsearch/__init__.py:
--------------------------------------------------------------------------------
1 | from .ods_agent import OpenDeepSearchAgent
2 | from .ods_tool import OpenDeepSearchTool
3 |
4 | __all__ = ['OpenDeepSearchAgent', 'OpenDeepSearchTool']
5 |
--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/build_context.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional
2 | from loguru import logger
3 | from langchain.text_splitter import RecursiveCharacterTextSplitter
4 |
5 |
6 | def extract_information(organic_results: List[Dict]) -> List[str]:
7 |     """Extract snippets from organic search results as formatted strings."""
8 | formatted_results = []
9 | for item in organic_results:
10 | if 'snippet' in item:
11 | result_parts = [
12 | f"title: {item.get('title', 'N/A')}",
13 | f"date authored: {item.get('date', 'N/A')}",
14 | f"link: {item.get('link', 'N/A')}",
15 | f"snippet: {item['snippet']}"
16 | ]
17 |
18 | if 'html' in item:
19 | result_parts.append(f"additional information: {item['html']}")
20 |
21 | formatted_results.append('\n'.join(result_parts))
22 |
23 | return formatted_results
24 |
25 | def extract_top_stories(top_stories: Optional[List[Dict]]) -> List[str]:
26 | """Extract titles from top stories."""
27 | if not top_stories:
28 | return []
29 |
30 | return [
31 | item['title']
32 | for item in top_stories
33 | if 'title' in item
34 | ]
35 |
36 | def extract_answer_box(
37 | answer_box: Optional[Dict]
38 | ) -> List[str]:
39 | """Extract information from answer box."""
40 | results = []
41 |
42 | if answer_box:
43 | for key in ['answer', 'snippet']:
44 | if answer_box.get(key):
45 | results.append(answer_box[key])
46 |
47 | return results
48 |
49 | def build_context(
50 | sources_result: Dict,
51 | ) -> str:
52 | """
53 | Build context from search results.
54 |
55 | Args:
56 | sources_result: Dictionary containing search results
57 |
58 | Returns:
59 | A formatted string containing all relevant search results
60 | """
61 | try:
62 | # Build context from different components
63 | organic_results = extract_information(sources_result.get('organic', []))
64 | top_stories = extract_top_stories(sources_result.get('topStories'))
65 | answer_box = extract_answer_box(
66 | sources_result.get('answerBox')
67 | )
68 |
69 | # Combine all results into a single string
70 | context_parts = []
71 |
72 | # Add answer box if available
73 | if answer_box:
74 | context_parts.append("ANSWER BOX:")
75 | context_parts.extend(answer_box)
76 | context_parts.append("") # Empty line for separation
77 |
78 | # Add organic results
79 | if organic_results:
80 | context_parts.append("SEARCH RESULTS:")
81 | context_parts.extend(organic_results)
82 | context_parts.append("") # Empty line for separation
83 |
84 | # Add top stories if available
85 | if top_stories:
86 | context_parts.append("TOP STORIES:")
87 | context_parts.extend(top_stories)
88 |
89 | # Join all parts with newlines
90 | return "\n".join(context_parts)
91 |
92 | except Exception as e:
93 | logger.exception(f"An error occurred while building context: {e}")
94 | return "" # Return empty string in case of error
95 |
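
To make the expected input shape concrete, here is a sketch with an illustrative Serper-style payload; the field values are fabricated, but the keys mirror the accessors above.

```
# Illustrative call; only 'answerBox', 'organic', and 'topStories' are read.
from opendeepsearch.context_building.build_context import build_context

sources_result = {
    "answerBox": {"answer": "Paris"},
    "organic": [{
        "title": "France - Wikipedia",
        "date": "Jan 1, 2024",
        "link": "https://en.wikipedia.org/wiki/France",
        "snippet": "The capital of France is Paris.",
    }],
    "topStories": [{"title": "A headline about France"}],
}
print(build_context(sources_result))  # prints ANSWER BOX / SEARCH RESULTS / TOP STORIES sections
```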
--------------------------------------------------------------------------------
/src/opendeepsearch/context_building/process_sources_pro.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Optional, Tuple
3 | from opendeepsearch.context_scraping.crawl4ai_scraper import WebScraper
4 | from opendeepsearch.ranking_models.infinity_rerank import InfinitySemanticSearcher
5 | from opendeepsearch.ranking_models.jina_reranker import JinaReranker
6 | from opendeepsearch.ranking_models.chunker import Chunker
7 |
8 | @dataclass
9 | class Source:
10 | link: str
11 | html: str = ""
12 | # Add other relevant fields here
13 |
14 | class SourceProcessor:
15 | def __init__(
16 | self,
17 | top_results: int = 5,
18 |         strategies: Optional[List[str]] = None,  # avoids a shared mutable default
19 | filter_content: bool = True,
20 | reranker: str = "infinity"
21 | ):
22 |         self.strategies = strategies if strategies is not None else ["no_extraction"]
23 | self.filter_content = filter_content
24 | self.scraper = WebScraper(
25 | strategies=self.strategies,
26 | filter_content=self.filter_content
27 | )
28 | self.top_results = top_results
29 | self.chunker = Chunker()
30 |
31 | # Initialize the appropriate reranker
32 | if reranker.lower() == "jina":
33 | self.semantic_searcher = JinaReranker()
34 | print("Using Jina Reranker")
35 | else: # default to infinity
36 | self.semantic_searcher = InfinitySemanticSearcher()
37 | print("Using Infinity Reranker")
38 |
39 | async def process_sources(
40 | self,
41 | sources: List[dict],
42 | num_elements: int,
43 | query: str,
44 | pro_mode: bool = False
45 | ) -> List[dict]:
46 | try:
47 | valid_sources = self._get_valid_sources(sources, num_elements)
48 | if not valid_sources:
49 |                 return sources.data  # match the payload type returned on success
50 |
51 | if not pro_mode:
52 | # Check if there's a Wikipedia article among valid sources
53 | wiki_sources = [(i, source) for i, source in valid_sources
54 | if 'wikipedia.org' in source['link']]
55 | if not wiki_sources:
56 | return sources.data
57 | # If Wikipedia article exists, only process that
58 | valid_sources = wiki_sources[:1] # Take only the first Wikipedia source
59 |
60 | html_contents = await self._fetch_html_contents([s[1]['link'] for s in valid_sources])
61 | return self._update_sources_with_content(sources.data, valid_sources, html_contents, query)
62 | except Exception as e:
63 | print(f"Error in process_sources: {e}")
64 |             return getattr(sources, 'data', sources)  # avoid raising again if .data is missing
65 |
66 | def _get_valid_sources(self, sources: List[dict], num_elements: int) -> List[Tuple[int, dict]]:
67 | return [(i, source) for i, source in enumerate(sources.data['organic'][:num_elements]) if source]
68 |
69 | async def _fetch_html_contents(self, links: List[str]) -> List[str]:
70 | raw_contents = await self.scraper.scrape_many(links)
71 | return [x['no_extraction'].content for x in raw_contents.values()]
72 |
73 | def _process_html_content(self, html: str, query: str) -> str:
74 | if not html:
75 | return ""
76 | try:
77 | # Split the HTML content into chunks
78 | documents = self.chunker.split_text(html)
79 |
80 | # Rerank the chunks based on the query
81 | reranked_content = self.semantic_searcher.get_reranked_documents(
82 | query,
83 | documents,
84 | top_k=self.top_results
85 | )
86 |
87 | return reranked_content
88 |
89 | except Exception as e:
90 | print(f"Error in content processing: {e}")
91 | return ""
92 |
93 | def _update_sources_with_content(
94 | self,
95 | sources: List[dict],
96 | valid_sources: List[Tuple[int, dict]],
97 | html_contents: List[str],
98 | query: str
99 | ) -> List[dict]:
100 | for (i, source), html in zip(valid_sources, html_contents):
101 | source['html'] = self._process_html_content(html, query)
102 | # sources[i] = source
103 | return sources
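
A hedged usage sketch: `process_sources` is async and, as the accessors above show, expects a SERP response wrapper exposing a `.data` dict with an 'organic' list; `serp_response` below is a stand-in for that object.

```
# Hypothetical driver; serp_response must expose .data['organic'].
import asyncio

from opendeepsearch.context_building.process_sources_pro import SourceProcessor

async def demo(serp_response):
    processor = SourceProcessor(reranker="jina")
    return await processor.process_sources(
        serp_response, num_elements=3, query="capital of France", pro_mode=True
    )

# asyncio.run(demo(serp_response))
```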
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/basic_web_scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the BasicWebScraper class for basic web scraping functionality.
3 | """
4 |
5 | from dataclasses import dataclass
6 | from typing import Optional
7 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
8 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
9 | from crawl4ai.content_filter_strategy import PruningContentFilter
10 |
11 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult
12 | from crawl4ai.extraction_strategy import ExtractionStrategy
13 |
14 | @dataclass
15 | class ExtractionConfig:
16 | """Configuration for extraction strategies"""
17 | name: str
18 | strategy: ExtractionStrategy
19 |
20 | class BasicWebScraper:
21 | """Basic web scraper implementation"""
22 | def __init__(self, browser_config: Optional[BrowserConfig] = None):
23 | self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True)
24 |
25 | def _create_crawler_config(self) -> CrawlerRunConfig:
26 | """Creates default crawler configuration"""
27 | return CrawlerRunConfig(
28 | cache_mode=CacheMode.BYPASS,
29 | markdown_generator=DefaultMarkdownGenerator(
30 | content_filter=PruningContentFilter()
31 | )
32 | )
33 |
34 | async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult:
35 | """Performs extraction using specified strategy"""
36 | try:
37 | config = self._create_crawler_config()
38 | config.extraction_strategy = extraction_config.strategy
39 |
40 | async with AsyncWebCrawler(config=self.browser_config) as crawler:
41 | result = await crawler.arun(url=url, config=config)
42 |
43 | extraction_result = ExtractionResult(
44 | name=extraction_config.name,
45 | success=result.success,
46 | content=result.extracted_content
47 | )
48 |
49 |             if result.success and hasattr(result, 'markdown_v2'):
50 | extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown)
51 | extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations)
52 |
53 | return extraction_result
54 |
55 | except Exception as e:
56 | return ExtractionResult(
57 | name=extraction_config.name,
58 | success=False,
59 | error=str(e)
60 | )
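
A minimal usage sketch, assuming `NoExtractionStrategy` from crawl4ai (the same strategy family used by strategy_factory.py); the URL is a placeholder.

```
# Hypothetical one-off extraction with the basic scraper.
import asyncio

from crawl4ai.extraction_strategy import NoExtractionStrategy
from opendeepsearch.context_scraping.basic_web_scraper import BasicWebScraper, ExtractionConfig

async def demo():
    scraper = BasicWebScraper()
    config = ExtractionConfig(name="no_extraction", strategy=NoExtractionStrategy())
    result = await scraper.extract(config, "https://example.com")
    print(result.success, (result.content or "")[:200])

# asyncio.run(demo())
```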
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/crawl4ai_scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Modular web scraping implementation using Crawl4AI.
3 | Supports multiple extraction strategies including LLM, CSS, and XPath.
4 | """
5 |
6 | import asyncio
7 | import os
8 | from dataclasses import dataclass
9 | from typing import Dict, List, Optional
10 |
11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
12 | from crawl4ai.content_filter_strategy import PruningContentFilter
13 | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
14 |
15 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult, print_extraction_result
16 | from opendeepsearch.context_scraping.basic_web_scraper import ExtractionConfig
17 | from opendeepsearch.context_scraping.strategy_factory import StrategyFactory
18 |
19 | class WebScraper:
20 | """Unified scraper that encapsulates all extraction strategies and configuration"""
21 | def __init__(
22 | self,
23 | browser_config: Optional[BrowserConfig] = None,
24 |         strategies: Optional[List[str]] = None,  # avoids a shared mutable default
25 | llm_instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content",
26 | user_query: Optional[str] = None,
27 | debug: bool = False,
28 | filter_content: bool = False
29 | ):
30 | self.browser_config = browser_config or BrowserConfig(headless=True, verbose=True)
31 | self.debug = debug
32 | self.factory = StrategyFactory()
33 |         self.strategies = strategies if strategies is not None else ['no_extraction']  # preserve the previous default
34 | self.llm_instruction = llm_instruction
35 | self.user_query = user_query
36 | self.filter_content = filter_content
37 |
38 | # Validate strategies
39 | valid_strategies = {'markdown_llm', 'html_llm', 'fit_markdown_llm', 'css', 'xpath', 'no_extraction', 'cosine'}
40 | invalid_strategies = set(self.strategies) - valid_strategies
41 | if invalid_strategies:
42 | raise ValueError(f"Invalid strategies: {invalid_strategies}")
43 |
44 | # Initialize strategy map
45 | self.strategy_map = {
46 | 'markdown_llm': lambda: self.factory.create_llm_strategy('markdown', self.llm_instruction),
47 | 'html_llm': lambda: self.factory.create_llm_strategy('html', self.llm_instruction),
48 | 'fit_markdown_llm': lambda: self.factory.create_llm_strategy('fit_markdown', self.llm_instruction),
49 | 'css': self.factory.create_css_strategy,
50 | 'xpath': self.factory.create_xpath_strategy,
51 | 'no_extraction': self.factory.create_no_extraction_strategy,
52 | 'cosine': lambda: self.factory.create_cosine_strategy(debug=self.debug)
53 | }
54 |
55 | def _create_crawler_config(self) -> CrawlerRunConfig:
56 | """Creates default crawler configuration"""
57 | content_filter = PruningContentFilter(user_query=self.user_query) if self.user_query else PruningContentFilter()
58 | return CrawlerRunConfig(
59 | cache_mode=CacheMode.BYPASS,
60 | markdown_generator=DefaultMarkdownGenerator(
61 | content_filter=content_filter
62 | )
63 | )
64 |
65 | async def scrape(self, url: str) -> Dict[str, ExtractionResult]:
66 | """
67 | Scrape URL using configured strategies
68 |
69 | Args:
70 | url: Target URL to scrape
71 | """
72 | # Handle Wikipedia URLs
73 | if 'wikipedia.org/wiki/' in url:
74 |             from opendeepsearch.context_scraping.utils import get_wikipedia_content
75 | try:
76 | content = get_wikipedia_content(url)
77 | # Create same result for all strategies since we're using Wikipedia content
78 | return {
79 | strategy_name: ExtractionResult(
80 | name=strategy_name,
81 | success=True,
82 | content=content
83 | ) for strategy_name in self.strategies
84 | }
85 | except Exception as e:
86 | if self.debug:
87 | print(f"Debug: Wikipedia extraction failed: {str(e)}")
88 | # If Wikipedia extraction fails, fall through to normal scraping
89 |
90 | # Normal scraping for non-Wikipedia URLs or if Wikipedia extraction failed
91 | results = {}
92 | for strategy_name in self.strategies:
93 | config = ExtractionConfig(
94 | name=strategy_name,
95 | strategy=self.strategy_map[strategy_name]()
96 | )
97 | result = await self.extract(config, url)
98 | results[strategy_name] = result
99 |
100 | return results
101 |
102 | async def scrape_many(self, urls: List[str]) -> Dict[str, Dict[str, ExtractionResult]]:
103 | """
104 | Scrape multiple URLs using configured strategies in parallel
105 |
106 | Args:
107 | urls: List of target URLs to scrape
108 |
109 | Returns:
110 | Dictionary mapping URLs to their extraction results
111 | """
112 | # Create tasks for all URLs
113 | tasks = [self.scrape(url) for url in urls]
114 | # Run all tasks concurrently
115 | results_list = await asyncio.gather(*tasks)
116 |
117 | # Build results dictionary
118 | results = {}
119 | for url, result in zip(urls, results_list):
120 | results[url] = result
121 |
122 | return results
123 |
124 | async def extract(self, extraction_config: ExtractionConfig, url: str) -> ExtractionResult:
125 | """Internal method to perform extraction using specified strategy"""
126 | try:
127 | config = self._create_crawler_config()
128 | config.extraction_strategy = extraction_config.strategy
129 |
130 | if self.debug:
131 | print(f"\nDebug: Attempting extraction with strategy: {extraction_config.name}")
132 | print(f"Debug: URL: {url}")
133 | print(f"Debug: Strategy config: {config.extraction_strategy}")
134 | if self.user_query:
135 | print(f"Debug: User query: {self.user_query}")
136 |
137 | async with AsyncWebCrawler(config=self.browser_config) as crawler:
138 | if isinstance(url, list):
139 | result = await crawler.arun_many(urls=url, config=config)
140 | else:
141 | result = await crawler.arun(url=url, config=config)
142 |
143 | if self.debug:
144 | print(f"Debug: Raw result attributes: {dir(result)}")
145 | print(f"Debug: Raw result: {result.__dict__}")
146 |
147 | # Handle different result formats based on strategy
148 | content = None
149 | if result.success:
150 | if extraction_config.name in ['no_extraction', 'cosine']:
151 | # For strategies that return a list of dictionaries
152 | if hasattr(result, 'markdown_v2'):
153 | content = result.markdown_v2.raw_markdown
154 | elif hasattr(result, 'raw_html'):
155 | content = result.raw_html
156 | elif hasattr(result, 'extracted_content') and result.extracted_content:
157 | if isinstance(result.extracted_content, list):
158 | content = '\n'.join(item.get('content', '') for item in result.extracted_content)
159 | else:
160 | content = result.extracted_content
161 |
162 | if self.filter_content and content:
163 |                         from opendeepsearch.context_scraping.utils import filter_quality_content
164 | content = filter_quality_content(content)
165 | else:
166 | content = result.extracted_content
167 | if self.filter_content and content:
168 |                     from opendeepsearch.context_scraping.utils import filter_quality_content
169 | content = filter_quality_content(content)
170 |
171 | if self.debug:
172 | print(f"Debug: Processed content: {content[:200] if content else None}")
173 |
174 | extraction_result = ExtractionResult(
175 | name=extraction_config.name,
176 | success=result.success,
177 | content=content,
178 | error=getattr(result, 'error', None) # Capture error if available
179 | )
180 |
181 |             if result.success and hasattr(result, 'markdown_v2'):
182 |                 extraction_result.raw_markdown_length = len(result.markdown_v2.raw_markdown)
183 |                 extraction_result.citations_markdown_length = len(result.markdown_v2.markdown_with_citations)
184 |             if self.debug:
185 |                 print(f"Debug: Final extraction result: {extraction_result.__dict__}")
186 |
187 | return extraction_result
188 |
189 | except Exception as e:
190 | if self.debug:
191 | import traceback
192 | print(f"Debug: Exception occurred during extraction:")
193 | print(traceback.format_exc())
194 |
195 | return ExtractionResult(
196 | name=extraction_config.name,
197 | success=False,
198 | error=str(e)
199 | )
200 |
201 | async def main():
202 | # Example usage with single URL
203 | single_url = "https://example.com/product-page"
204 | scraper = WebScraper(debug=True)
205 | results = await scraper.scrape(single_url)
206 |
207 | # Print single URL results
208 | for result in results.values():
209 | print_extraction_result(result)
210 |
211 | # Example usage with multiple URLs
212 | urls = [
213 | "https://example.com",
214 | "https://python.org",
215 | "https://github.com"
216 | ]
217 |
218 | multi_results = await scraper.scrape_many(urls)
219 |
220 | # Print multiple URL results
221 | for url, url_results in multi_results.items():
222 | print(f"\nResults for {url}:")
223 | for result in url_results.values():
224 | print_extraction_result(result)
225 |
226 | if __name__ == "__main__":
227 | asyncio.run(main())
228 |
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/extraction_result.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the ExtractionResult class for holding extraction operation results.
3 | """
4 |
5 | from typing import Optional
6 |
7 | class ExtractionResult:
8 | """Holds the results of an extraction operation"""
9 | def __init__(self, name: str, success: bool, content: Optional[str] = None, error: Optional[str] = None):
10 | self.name = name
11 | self.success = success
12 | self.content = content
13 | self.error = error
14 | self.raw_markdown_length = 0
15 | self.citations_markdown_length = 0
16 |
17 | def print_extraction_result(result: ExtractionResult):
18 | """Utility function to print extraction results"""
19 | if result.success:
20 | print(f"\n=== {result.name} Results ===")
21 | print(f"Extracted Content: {result.content}")
22 | print(f"Raw Markdown Length: {result.raw_markdown_length}")
23 | print(f"Citations Markdown Length: {result.citations_markdown_length}")
24 | else:
25 | print(f"Error in {result.name}: {result.error}")
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/fast_scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Enhanced web scraping implementation using Crawl4AI and vLLM.
3 | Supports multiple extraction strategies with LLM-powered content processing.
4 | """
5 |
6 | import asyncio
7 | from dataclasses import dataclass
8 | from typing import Dict, List, Optional, Any
9 | import json
10 |
11 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
12 | from vllm import LLM, SamplingParams
13 |
14 | from opendeepsearch.context_scraping.extraction_result import ExtractionResult
15 | from opendeepsearch.context_scraping.utils import clean_html, get_wikipedia_content
16 |
17 | @dataclass
18 | class LLMConfig:
19 | """Configuration for LLM-based extraction"""
20 | model_name: str = 'jinaai/ReaderLM-v2'
21 | max_model_len: int = 512_000
22 | temperature: float = 0.0
23 | top_k: int = 1
24 | presence_penalty: float = 0.25
25 | frequency_penalty: float = 0.25
26 | repetition_penalty: float = 1.13
27 | max_tokens: int = 16_384
28 |
29 | # DEFAULT_SCHEMA = """
30 | # {
31 | # "type": "object",
32 | # "properties": {
33 | # "title": {
34 | # "type": "string"
35 | # },
36 | # "author": {
37 | # "type": "string"
38 | # },
39 | # "date": {
40 | # "type": "string"
41 | # },
42 | # "content": {
43 | # "type": "string"
44 | # }
45 | # },
46 | # "required": ["title", "author", "date", "content"]
47 | # }
48 | # """
49 |
50 | class FastWebScraper:
51 | """Enhanced scraper with LLM-powered extraction and multiple strategies"""
52 | def __init__(
53 | self,
54 | llm_config: Optional[LLMConfig] = None,
55 | browser_config: Optional[BrowserConfig] = None,
56 | json_schema: Optional[Dict[str, Any]] = None,
57 | debug: bool = False
58 | ):
59 | self.debug = debug
60 | self.browser_config = browser_config or BrowserConfig(headless=True, verbose=debug)
61 | self.llm_config = llm_config or LLMConfig()
62 |         self.json_schema = json_schema  # optional; DEFAULT_SCHEMA above is left disabled
63 |
64 | # Initialize LLM
65 | self.sampling_params = SamplingParams(
66 | temperature=self.llm_config.temperature,
67 | top_k=self.llm_config.top_k,
68 | presence_penalty=self.llm_config.presence_penalty,
69 | repetition_penalty=self.llm_config.repetition_penalty,
70 | max_tokens=self.llm_config.max_tokens,
71 | frequency_penalty=self.llm_config.frequency_penalty
72 | )
73 |
74 | self.llm = LLM(
75 | model=self.llm_config.model_name,
76 | max_model_len=self.llm_config.max_model_len,
77 | dtype='float16'
78 | )
79 |
80 | self.tokenizer = self.llm.get_tokenizer()
81 |
82 | def _create_prompt(self, text: str, instruction: Optional[str] = None) -> str:
83 | """Create a prompt for the LLM"""
84 | if not instruction:
85 | instruction = "Extract the main content and convert to structured format."
86 |
87 | if self.json_schema:
88 | instruction = "Extract information according to the schema and return JSON."
89 | prompt = f"{instruction}\n```html\n{text}\n```\nSchema:```json\n{json.dumps(self.json_schema, indent=2)}\n```"
90 | else:
91 | prompt = f"{instruction}\n```html\n{text}\n```"
92 |
93 | messages = [{"role": "user", "content": prompt}]
94 | return self.tokenizer.apply_chat_template(
95 | messages, tokenize=False, add_generation_prompt=True
96 | )
97 |
98 | async def _extract_content(self, html: str, instruction: Optional[str] = None) -> str:
99 | """Extract content using LLM"""
100 | cleaned_html = clean_html(html, clean_svg=True, clean_base64=True)
101 | prompt = self._create_prompt(cleaned_html, instruction)
102 |
103 | outputs = self.llm.generate(prompt, self.sampling_params)
104 | raw_text = outputs[0].outputs[0].text
105 | return self._parse_llm_output(raw_text)
106 |
107 | def _parse_llm_output(self, text: str) -> str:
108 | """
109 | Parse LLM output, handling both single dictionaries and lists of dictionaries.
110 | Returns the content field from the most appropriate dictionary.
111 | """
112 | try:
113 | # Strip any markdown code block markers
114 | text = text.strip()
115 | if text.startswith('```') and text.endswith('```'):
116 | text = text.split('```')[1]
117 | if text.startswith('json'):
118 | text = text[4:]
119 |
120 | data = json.loads(text.strip())
121 |
122 | if isinstance(data, dict):
123 | return data.get('content', '')
124 |
125 | if isinstance(data, list):
126 | # First try to find a dictionary with non-empty content
127 | for item in data:
128 | if isinstance(item, dict) and item.get('content'):
129 | return item['content']
130 |
131 | # If no content found, return content from last item or empty string
132 | last_item = data[-1]
133 | return last_item.get('content', '') if isinstance(last_item, dict) else ''
134 |
135 | return ''
136 |
137 | except json.JSONDecodeError:
138 | # If JSON parsing fails, return the original text
139 | return text.strip()
140 | except Exception:
141 | return ''
142 |
143 | async def scrape(self, url: str, instruction: Optional[str] = None) -> ExtractionResult:
144 | """
145 | Scrape and process content from a URL
146 |
147 | Args:
148 | url: Target URL to scrape
149 | instruction: Optional custom instruction for the LLM
150 | """
151 | try:
152 | if self.debug:
153 | print(f"Debug: Processing URL: {url}")
154 |
155 | # Handle Wikipedia URLs
156 | if 'wikipedia.org/wiki/' in url:
157 | try:
158 | content = get_wikipedia_content(url)
159 | return ExtractionResult(
160 | name="llm_extraction",
161 | success=True,
162 | content=content
163 | )
164 | except Exception as e:
165 | if self.debug:
166 | print(f"Debug: Wikipedia extraction failed: {str(e)}")
167 | # If Wikipedia extraction fails, fall through to normal scraping
168 |
169 | # Fetch HTML
170 | async with AsyncWebCrawler(config=self.browser_config) as crawler:
171 | result = await crawler.arun(url=url, config=CrawlerRunConfig())
172 |
173 | if not result.success:
174 | return ExtractionResult(
175 | name="llm_extraction",
176 | success=False,
177 | error="Failed to fetch HTML"
178 | )
179 |
180 | # Process with LLM
181 | content = await self._extract_content(result.html, instruction)
182 |
183 | return ExtractionResult(
184 | name="llm_extraction",
185 | success=True,
186 | content=content
187 | )
188 |
189 | except Exception as e:
190 | if self.debug:
191 | import traceback
192 | print(f"Debug: Exception during scraping:")
193 | print(traceback.format_exc())
194 |
195 | return ExtractionResult(
196 | name="llm_extraction",
197 | success=False,
198 | error=str(e)
199 | )
200 |
201 | async def scrape_many(self, urls: List[str], instruction: Optional[str] = None) -> Dict[str, ExtractionResult]:
202 | """
203 | Scrape multiple URLs
204 |
205 | Args:
206 | urls: List of target URLs
207 | instruction: Optional custom instruction for the LLM
208 | """
209 | results = {}
210 | for url in urls:
211 | results[url] = await self.scrape(url, instruction)
212 | return results
213 |
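
A usage sketch, with the caveat that constructing `FastWebScraper` loads ReaderLM-v2 into vLLM and therefore needs a capable GPU; the URL is a placeholder.

```
# Hypothetical one-off scrape; the model is loaded in the constructor.
import asyncio

from opendeepsearch.context_scraping.fast_scraper import FastWebScraper

async def demo():
    scraper = FastWebScraper(debug=True)
    result = await scraper.scrape("https://example.com")
    print(result.success, (result.content or "")[:200])

# asyncio.run(demo())
```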
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/strategy_factory.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the StrategyFactory class for creating various extraction strategies.
3 | """
4 |
5 | import os
6 | from typing import Optional
7 |
8 | from crawl4ai.extraction_strategy import (
9 | LLMExtractionStrategy,
10 | JsonCssExtractionStrategy,
11 | JsonXPathExtractionStrategy,
12 | NoExtractionStrategy,
13 | CosineStrategy,
14 | )
15 |
16 | class StrategyFactory:
17 | """Factory for creating extraction strategies"""
18 | @staticmethod
19 | def create_llm_strategy(
20 | input_format: str = "markdown",
21 | instruction: str = "Extract relevant content from the provided text, only return the text, no markdown formatting, remove all footnotes, citations, and other metadata and only keep the main content",
22 | ) -> LLMExtractionStrategy:
23 | return LLMExtractionStrategy(
24 | input_format=input_format,
25 | provider="openrouter/google/gemini-2.0-flash-lite-001", # Uses LiteLLM as provider
26 | api_token=os.getenv("OPENROUTER_API_KEY"),
27 | instruction=instruction
28 | )
29 |
30 | @staticmethod
31 | def create_css_strategy() -> JsonCssExtractionStrategy:
32 | schema = {
33 | "baseSelector": ".product",
34 | "fields": [
35 | {"name": "title", "selector": "h1.product-title", "type": "text"},
36 | {"name": "price", "selector": ".price", "type": "text"},
37 | {"name": "description", "selector": ".description", "type": "text"},
38 | ],
39 | }
40 | return JsonCssExtractionStrategy(schema=schema)
41 |
42 | @staticmethod
43 | def create_xpath_strategy() -> JsonXPathExtractionStrategy:
44 | schema = {
45 | "baseSelector": "//div[@class='product']",
46 | "fields": [
47 | {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
48 | {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
49 | {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"},
50 | ],
51 | }
52 | return JsonXPathExtractionStrategy(schema=schema)
53 |
54 | @staticmethod
55 | def create_no_extraction_strategy() -> NoExtractionStrategy:
56 | return NoExtractionStrategy()
57 |
58 | @staticmethod
59 | def create_cosine_strategy(
60 | semantic_filter: Optional[str] = None,
61 | word_count_threshold: int = 10,
62 | max_dist: float = 0.2,
63 | sim_threshold: float = 0.3,
64 | debug: bool = False
65 | ) -> CosineStrategy:
66 | return CosineStrategy(
67 | semantic_filter=semantic_filter,
68 | word_count_threshold=word_count_threshold,
69 | max_dist=max_dist,
70 | sim_threshold=sim_threshold,
71 | verbose=debug
72 | )
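
The CSS and XPath schemas above target a generic '.product' page layout and are best read as templates; an illustrative use of the factory:

```
# Illustrative only; swap the selectors for your target site.
from opendeepsearch.context_scraping.strategy_factory import StrategyFactory

factory = StrategyFactory()
css_strategy = factory.create_css_strategy()                          # JSON-CSS product schema
llm_strategy = factory.create_llm_strategy(input_format="markdown")   # needs OPENROUTER_API_KEY
cosine_strategy = factory.create_cosine_strategy(semantic_filter="pricing", debug=True)
```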
--------------------------------------------------------------------------------
/src/opendeepsearch/context_scraping/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Tuple
3 | import fasttext
4 | from huggingface_hub import hf_hub_download
5 | import wikipediaapi
6 |
7 | # Load the model
8 | model = fasttext.load_model(hf_hub_download("kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2", "model.bin"))
9 |
10 | def clean_markdown_links(text: str, min_quality_score: float = 0.2) -> Tuple[str, float]:
11 | """
12 | Clean markdown links and filter low-quality content.
13 | Returns tuple of (cleaned_text, quality_score)
14 | """
15 | # Split by double newlines to preserve paragraph structure
16 | paragraphs = text.split('\n\n')
17 |
18 | cleaned_paragraphs = []
19 | for paragraph in paragraphs:
20 | # Preserve code blocks by checking if paragraph contains ``` tags
21 | if '```' in paragraph:
22 | cleaned_paragraphs.append(paragraph)
23 | continue
24 |
25 | lines = paragraph.split('\n')
26 | filtered_lines = []
27 | for line in lines:
28 | line = line.strip()
29 | # Keep headers regardless of length
30 | if re.match(r'^#{1,6}\s+', line):
31 | filtered_lines.append(line)
32 | continue
33 |
34 | # Skip common UI/navigation elements
35 | if re.match(r'^(Share|Trade|More|Buy|Sell|Download|Menu|Home|Back|Next|Previous|\d+\s*(BTC|USD|EUR|GBP)|\w{3}-\w{1,3}|Currency:.*|You (Buy|Spend|Receive)|≈|\d+\.\d+)', line, re.IGNORECASE):
36 | continue
37 |
38 | # Count words before removing markdown
39 | word_count = len(re.sub(r'\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>', '', line).split())
40 |
41 | # Increase minimum word threshold to 12
42 | if word_count < 12:
43 | # Check if line only contains markdown patterns or appears to be a currency/trading related line
44 | cleaned_line = re.sub(r'\[!\[.*?\]\(.*?\)\]\(.*?\)|\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|<.*?>|\d+(\.\d+)?%?|\$\d+(\.\d+)?', '', line).strip()
45 | if not cleaned_line or len(cleaned_line.split()) < 8: # If nothing substantial remains, skip this line
46 | continue
47 |
48 | filtered_lines.append(line)
49 |
50 | # Only add paragraph if it has any lines left
51 | if filtered_lines:
52 | cleaned_paragraphs.append('\n'.join(filtered_lines))
53 |
54 | # Rejoin with double newlines
55 | cleaned_text = '\n\n'.join(cleaned_paragraphs)
56 |
57 | # Get quality score
58 | quality_score = predict_educational_value([cleaned_text])[0]
59 |
60 | return cleaned_text, quality_score
61 |
62 | def filter_quality_content(text: str, min_quality_score: float = 0.2) -> str:
63 | """
64 |     Filter content based on quality and return the concatenated quality content.
65 | """
66 | # Split text into paragraphs
67 | paragraphs = text.split('\n\n')
68 |
69 | # Process each paragraph
70 | quality_content = []
71 | for paragraph in paragraphs:
72 | if not paragraph.strip(): # Skip empty paragraphs
73 | continue
74 |
75 | cleaned_text, quality_score = clean_markdown_links(paragraph, min_quality_score)
76 | if cleaned_text and quality_score >= min_quality_score:
77 | quality_content.append((cleaned_text, quality_score))
78 |
79 | # Debug print
80 | print(f"Found {len(quality_content)} quality paragraphs out of {len(paragraphs)} total")
81 |
82 | if quality_content:
83 | return "\n\n".join(text for text, _ in quality_content)
84 | return text # Return original text if no quality content found
85 |
86 | def replace_newlines(text: str) -> str:
87 | """Replace multiple newlines with a single space."""
88 | return re.sub("\n+", " ", text)
89 |
90 | score_dict = {
91 | '__label__': 0,
92 | '__label__Low': 0,
93 | '__label__Mid': 1,
94 | '__label__High': 2
95 | }
96 |
97 | def predict_educational_value(text_list: List[str]) -> List[float]:
98 | """
99 | Predict educational value scores for a list of texts.
100 | Returns a list of scores between 0 and 2.
101 | """
102 | text_list = [replace_newlines(text) for text in text_list]
103 | pred = model.predict(text_list, k=-1)
104 | score_list = []
105 | for l, s in zip(*pred):
106 | score = 0
107 | for _l, _s in zip(l, s):
108 | score += score_dict[_l] * _s
109 | score_list.append(float(score))
110 | return score_list
111 |
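
A worked example of the scoring: with k=-1 fasttext returns every label with its probability, and the score is the expected value under the {Low: 0, Mid: 1, High: 2} mapping (the probabilities below are fabricated).

```
# Illustrative expectation over made-up label probabilities.
labels = ("__label__High", "__label__Mid", "__label__Low")
probs = (0.6, 0.3, 0.1)
score = sum(score_dict[label] * p for label, p in zip(labels, probs))
assert abs(score - 1.5) < 1e-9  # 2*0.6 + 1*0.3 + 0*0.1
```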
112 | def get_wikipedia_content(url: str) -> str | None:
113 | """
114 | Extract content from a Wikipedia URL.
115 |
116 | Args:
117 | url: Wikipedia URL to scrape
118 |
119 | Returns:
120 | str: Page content if found, None otherwise
121 | """
122 | wiki = wikipediaapi.Wikipedia(user_agent="opendeepsearch", language='en')
123 |
124 | # Extract the page title from URL (everything after /wiki/)
125 | try:
126 | title = url.split('/wiki/')[-1]
127 | page = wiki.page(title)
128 | if page.exists():
129 | return page.text
130 | return None
131 | except Exception:
132 | return None
133 |
134 | # Patterns
135 | SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
136 | STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
137 | META_PATTERN = r"<[ ]*meta.*?>"
138 | COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
139 | LINK_PATTERN = r"<[ ]*link.*?>"
140 | BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
141 | SVG_PATTERN = r"(