├── .env.example
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── deep_researcher
    ├── __init__.py
    ├── agents
    │   ├── __init__.py
    │   ├── baseclass.py
    │   ├── knowledge_gap_agent.py
    │   ├── long_writer_agent.py
    │   ├── planner_agent.py
    │   ├── proofreader_agent.py
    │   ├── thinking_agent.py
    │   ├── tool_agents
    │   │   ├── __init__.py
    │   │   ├── crawl_agent.py
    │   │   └── search_agent.py
    │   ├── tool_selector_agent.py
    │   ├── utils
    │   │   ├── __init__.py
    │   │   └── parse_output.py
    │   └── writer_agent.py
    ├── deep_research.py
    ├── iterative_research.py
    ├── llm_config.py
    ├── main.py
    ├── tools
    │   ├── __init__.py
    │   ├── crawl_website.py
    │   └── web_search.py
    └── utils
    │   ├── __init__.py
    │   ├── markdown.css
    │   ├── md_to_pdf.py
    │   └── os.py
├── examples
    ├── deep_example.py
    ├── iterative_example.py
    ├── report_plan_example.py
    └── sample_output
    │   ├── labour_policies.md
    │   ├── plato.md
    │   ├── quantera_market_size.md
    │   ├── quantum_computing.md
    │   └── tesla.md
├── pyproject.toml
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── config.py
    ├── test_model_providers.py
    ├── test_reformat_references.py
    ├── test_reformat_section_headings.py
    ├── test_research_agents.py
    └── test_tool_agents.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # Note: You can optionally add the prefix 'DR_' to the variable names to avoid conflicts with other variables in your app
 2 | 
 3 | # LLM api keys
 4 | OPENAI_API_KEY=<your-openai-api-key>
 5 | DEEPSEEK_API_KEY=<your-deepseek-api-key>
 6 | OPENROUTER_API_KEY=<your-openrouter-api-key>
 7 | GEMINI_API_KEY=<your-google-api-key>
 8 | ANTHROPIC_API_KEY=<your-anthropic-api-key>
 9 | PERPLEXITY_API_KEY=<your-perplexity-api-key>
10 | HUGGINGFACE_API_KEY=<your-huggingface-api-key>
11 | LOCAL_MODEL_URL=<your-local-model-url>
12 | AZURE_OPENAI_ENDPOINT=<your-azureopenai-endpoint-url>
13 | AZURE_OPENAI_DEPLOYMENT=<your-deployment-name>
14 | AZURE_OPENAI_API_KEY=<your-azureopenai-api-key>
15 | AZURE_OPENAI_API_VERSION=<your-azureopenai-deployment-api-version>
16 | 
17 | # Search provider
18 | SEARCH_PROVIDER=serper  # serper or openai
19 | SERPER_API_KEY=<your-serper-api-key>
20 | 
21 | # Selected LLM models
22 | # Current options for model providers: 
23 | # azureopenai, openai, deepseek, openrouter, gemini, anthropic, perplexity, huggingface, local
24 | REASONING_MODEL_PROVIDER=openai
25 | REASONING_MODEL=o3-mini
26 | MAIN_MODEL_PROVIDER=openai
27 | MAIN_MODEL=gpt-4o
28 | FAST_MODEL_PROVIDER=openai
29 | FAST_MODEL=gpt-4o-mini
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | MANIFEST
23 | .env
24 | .venv
25 | env/
26 | venv/
27 | ENV/
28 | .python-version
29 | .pytest_cache/
30 | .coverage
31 | htmlcov/
32 | 
33 | # IDEs and Editors
34 | .idea/
35 | .vscode/
36 | *.swp
37 | *.swo
38 | *~
39 | .project
40 | .settings/
41 | .classpath
42 | 
43 | # Mac OS
44 | .DS_Store
45 | .AppleDouble
46 | .LSOverride
47 | Icon
48 | ._*
49 | .DocumentRevisions-V100
50 | .fseventsd
51 | .Spotlight-V100
52 | .TemporaryItems
53 | .Trashes
54 | .VolumeIcon.icns
55 | .com.apple.timemachine.donotpresent
56 | 
57 | # Windows
58 | Thumbs.db
59 | ehthumbs.db
60 | ehthumbs_vista.db
61 | *.stackdump
62 | [Dd]esktop.ini
63 | $RECYCLE.BIN/
64 | *.cab
65 | *.msi
66 | *.msix
67 | *.msm
68 | *.msp
69 | *.lnk
70 | 
71 | # Linux
72 | *~
73 | .fuse_hidden*
74 | .directory
75 | .Trash-*
76 | .nfs*
77 | 
78 | sandbox/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include README.md
3 | include LICENSE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | 
  3 | [![GitHub Stars](https://img.shields.io/github/stars/qx-labs/agents-deep-research?style=social)](https://github.com/qx-labs/agents-deep-research/stargazers)
  4 | [![GitHub Forks](https://img.shields.io/github/forks/qx-labs/agents-deep-research?style=social)](https://github.com/qx-labs/agents-deep-research/network/members)
  5 | 
  6 | [![PyPI version](https://badge.fury.io/py/deep-researcher.svg)](https://pypi.org/project/deep-researcher/)
  7 | [![License](https://img.shields.io/github/license/qx-labs/agents-deep-research)](https://github.com/qx-labs/agents-deep-research/blob/main/LICENSE)
  8 | [![PyPI Downloads](https://static.pepy.tech/badge/deep-researcher)](https://pepy.tech/projects/deep-researcher)
  9 | 
 10 | </div>
 11 | 
 12 | # Agentic Deep Research using the OpenAI Agents SDK
 13 | 
 14 | A powerful deep research assistant built using the [OpenAI Agents SDK](https://github.com/openai/openai-agents-python), designed to perform in-depth research on any given topic. Compatible with AzureOpenAI, OpenAI, Anthropic, Gemini, DeepSeek, Perplexity, OpenRouter, Hugging Face and local models such as Ollama.
 15 | 
 16 | It uses a multi-agent architecture that works iteratively, continually refining its understanding of a topic and producing increasingly detailed insights that feed the final report.
 17 | 
 18 | Designed to be extendable to use custom tools and any other 3rd party LLMs compatible with the OpenAI API spec. LLM and tool calls can be optionally traced using OpenAI's tracing feature.
 19 | 
 20 | Some background reading [here](https://www.j2.gg/thoughts/deep-research-how-it-works).
 21 | 
 22 | ## Overview
 23 | 
 24 | This package has two modes of research:
 25 | 
 26 | - An `IterativeResearcher` which runs a continuous loop of research on a topic or sub-topic and drafts a report
 27 |   - This is preferred and sufficient for shorter reports (up to 5 pages / 1,000 words)
 28 |   - The user can specify constraints such as research depth, time limits, report length and formatting instructions
 29 | - A `DeepResearcher` which runs a more thorough and structured process, first forming a report outline, and then running concurrent `IterativeResearcher` instances for each section of the report
 30 |   - This is useful for longer reports (e.g. 20+ pages)
 31 | 
 32 | The flow of the `DeepResearcher` is as follows:
 33 | 
 34 | 1. Takes a research topic and conducts preliminary research to form a report outline / plan
 35 | 2. For each section of the report plan, runs parallel instances of the `IterativeResearcher`, which:
 36 |    1. Identifies knowledge gaps in the current research
 37 |    2. Strategically selects the appropriate tools to fill those gaps
 38 |    3. Executes research actions through specialized agents
 39 |    4. Synthesizes findings into a comprehensive section
 40 | 3. Compiles all of the sections into a coherent and well-structured report
 41 | 
 42 | It is worth noting that the deep research agent does not ask clarifying questions at the start, so can be used in an automated fashion.
 43 | 
 44 | ## Sample Output
 45 | 
 46 | Deep Research Examples (using DeepResearcher):
 47 | - [Life and Works of Plato](examples/sample_output/plato.md) - 7,980 words
 48 | - [Text Book on Quantum Computing](examples/sample_output/quantum_computing.md) - 5,253 words
 49 | - [Deep-Dive on Tesla](examples/sample_output/tesla.md) - 4,732 words
 50 | 
 51 | Simple Research Examples (using IterativeResearcher):
 52 | - [Quantera Market Size](examples/sample_output/quantera_market_size.md) - 1,001 words
 53 | - [UK Government Policies](examples/sample_output/labour_policies.md) - 1,077 words
 54 | 
 55 | ## Flow Diagram
 56 | 
 57 | ### IterativeResearcher Flow
 58 | 
 59 | ```mermaid
 60 | flowchart LR
 61 |     A["User Input<br>- query<br>- max_iterations<br>- max_time<br>- output_instructions"] --> B
 62 | 
 63 |     subgraph "Deep Research Loop"
 64 |         B["Knowledge<br>Gap Agent"] -->|"Current gaps<br>& objective"| C["Tool Selector<br>Agent"]
 65 |         C -->|"Tool queries<br>(run in parallel)"| D["Tool Agents<br>- Web Search<br>- Crawler<br>- Custom tools"]
 66 |         D -->|"New findings"| E["Observations<br>Agent"]
 67 |         E --> |"Thoughts on findings<br>and research strategy"| B
 68 |     end
 69 | 
 70 |     E --> F["Writer Agent<br>(final output<br>with references)"]
 71 | ```
 72 | 
 73 | ### DeepResearcher Flow
 74 | 
 75 | ```mermaid
 76 | flowchart LR
 77 |     A["User Input<br>- query<br>- max_iterations<br>- max_time"] --> B["Planner Agent"]
 78 |     
 79 |     B -->|"Report plan<br>(sections & background context)"| D2
 80 |     
 81 |     subgraph Parallel["Parallel Section Research"]
 82 |         D1["IterativeResearcher<br>(Section 1)"]
 83 |         D2["IterativeResearcher<br>(Section 2)"]
 84 |         D3["IterativeResearcher<br>(Section 3)"]
 85 |     end
 86 |     
 87 |     D1 -->|"Section 1<br>Draft"| E["Proofreader<br>Agent"]
 88 |     D2 -->|"Section 2<br>Draft"| E
 89 |     D3 -->|"Section 3<br>Draft"| E
 90 |     
 91 |     E --> F["Final<br>Research<br>Report"]
 92 | ```
 93 | 
 94 | ## Installation
 95 | 
 96 | Install using `pip`:
 97 | 
 98 | ```
 99 | pip install deep-researcher
100 | ```
101 | 
102 | Or clone the GitHub repo:
103 | 
104 | ```sh
105 | git clone https://github.com/qx-labs/agents-deep-research.git
106 | cd agents-deep-research
107 | pip install -r requirements.txt
108 | ```
109 | 
110 | Then create a `.env` file with your API keys:
111 | 
112 | ```sh
113 | cp .env.example .env
114 | ```
115 | 
116 | Edit the `.env` file to add your OpenAI, Serper and other settings as needed, e.g.:
117 | 
118 | ```sh
119 | OPENAI_API_KEY=<your_key>
120 | SEARCH_PROVIDER=serper  # or set to openai
121 | SERPER_API_KEY=<your_key>
122 | ```
123 | 
124 | ## Usage
125 | 
126 | ### Python Module
127 | 
128 | ```python
129 | # See the /examples folder for working examples
130 | import asyncio
131 | from deep_researcher import IterativeResearcher, DeepResearcher
132 | 
133 | # Run the IterativeResearcher for simple queries
134 | researcher = IterativeResearcher(max_iterations=5, max_time_minutes=5)
135 | query = "Provide a comprehensive overview of quantum computing"
136 | report = asyncio.run(
137 |     researcher.run(query, output_length="5 pages")
138 | )
139 | 
140 | # Run the DeepResearcher for more lengthy and structured reports
141 | researcher = DeepResearcher(max_iterations=3, max_time_minutes=5)
142 | report = asyncio.run(
143 |     researcher.run(query)
144 | )
145 | 
146 | print(report)
147 | ```
148 | 
149 | #### Custom LLM Configuration at Runtime
150 | 
151 | When running the deep researcher in Python, you have the option to set custom LLM configuration variables at runtime. This gives you flexibility to dynamically change the model choice within your code.
152 | 
153 | ```python
154 | import asyncio
155 | from deep_researcher import DeepResearcher, LLMConfig
156 | 
157 | # These configuration options will take precedence over the environment variables
158 | llm_config = LLMConfig(
159 |     search_provider="serper",
160 |     reasoning_model_provider="openai",
161 |     reasoning_model="o3-mini",
162 |     main_model_provider="openai",
163 |     main_model="gpt-4o",
164 |     fast_model_provider="openai",
165 |     fast_model="gpt-4o-mini"
166 | )
167 | researcher = DeepResearcher(max_iterations=3, max_time_minutes=5, config=llm_config)
168 | report = asyncio.run(
169 |     researcher.run(query)
170 | )
171 | ```
172 | 
173 | ### Command Line
174 | 
175 | Run the research assistant from the command line.
176 | 
177 | If you've installed via `pip`:
178 | ```sh
179 | deep-researcher --mode deep --query "Provide a comprehensive overview of quantum computing" --max-iterations 3 --max-time 10 --verbose
180 | ```
181 | 
182 | Or if you've cloned the GitHub repo:
183 | 
184 | ```sh
185 | python -m deep_researcher.main --mode deep --query "Provide a comprehensive overview of quantum computing" --max-iterations 3 --max-time 10 --verbose
186 | ```
187 | 
188 | Parameters:
189 | 
190 | - `--query`: The research topic or question (if not provided, you'll be prompted)
191 | - `--mode`: If `deep` uses the DeepResearcher, if `simple` uses the IterativeResearcher (default: deep)
192 | - `--max-iterations`: Maximum number of research iterations (default: 5)
193 | - `--max-time`: Maximum time in minutes before the research loop auto-exits to produce a final output (default: 10)
194 | - `--output-length`: Desired output length for the report (default: "5 pages")
195 | - `--output-instructions`: Additional formatting instructions for the final report
196 | 
197 | Boolean Flags:
198 | 
199 | - `--verbose`: Prints the research progress to console
200 | - `--tracing`: Traces the workflow on the OpenAI platform (only works for OpenAI models)
201 | 
202 | ## Compatible Models
203 | 
204 | The deep researcher is designed to run any model compatible with the OpenAI API spec, and does so by adjusting the `base_url` parameter to the relevant model provider. Compatible providers include AzureOpenAI, OpenAI, Anthropic, Gemini, DeepSeek, Hugging Face and OpenRouter as well as locally hosted models via Ollama and LM Studio.
205 | 
206 | However, in order for the deep researcher to be run without errors it relies on models that are highly performant at tool calling.
207 | 
208 | - If using OpenAI models, we find that the `gpt-4o-mini` is as good if not better at tool selection than `o3-mini` (which is consistent with [this leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html)). Given the speed and cost benefits we therefore advise using `gpt-4o-mini` as the model for the majority of agents in our workflow, with `o3-mini` for planning tasks and `gpt-4o` for final writing.
209 | - If using Gemini models, note that only Gemini 2.5 Pro (currently `gemini-2.5-pro-preview-03-25`) works well. Gemini 2.0 Flash (`gemini-2.0-flash`), despite being listed as compatible with tool calling, very frequently fails to call any tools.
210 | 
211 | ## Architecture
212 | 
213 | The Deep Research Assistant is built with the following components:
214 | 
215 | ### Core Components
216 | 
217 | - **IterativeResearcher**: Orchestrates the iterative research workflow on a single topic or subtopic
218 | - **DeepResearcher**: Orchestrates a deeper and broader workflow that includes an initial report outline, calling of multiple parallel `IterativeResearch` instances, and final proofreading step
219 | - **LLMConfig**: Manages interactions with language models so that these can be swapped out as needed
220 | 
221 | ### Agent System
222 | 
223 | - **Knowledge Gap Agent**: Analyzes current research state and identifies gaps in knowledge
224 | - **Tool Selector Agent**: Determines which tools to use for addressing specific knowledge gaps
225 | - **Tool Agents**: Specialized agents for executing specific research actions (can be extended to add custom tools):
226 |   - Web Search Agent
227 |   - Website Crawler Agent
228 | - **Writer Agent**: Synthesizes research findings into coherent reports
229 | 
230 | ### Research Tools
231 | 
232 | - **Web Search**: Finds relevant information from SERP queries
233 |   - Our implementation uses [Serper](https://www.serper.dev) to run Google searches by default, which requires an API key set to the `SERPER_API_KEY` env variable.
234 |   - You can replace this with the native web search tool from OpenAI by setting the environment variable `SEARCH_PROVIDER` to `openai`
235 | - **Website Crawler**: Extracts detailed content from the pages of a given website
236 | 
237 | ### Implementing Custom Tool Agents
238 | 
239 | Tool agents are agents specialized in carrying out specific tasks using one or more tools (e.g. web searches, fetching and interpreting data from an API, etc). To implement a custom tool agent:
240 | * Create any tools that the agent will use in the `deep_researcher/tools` folder
241 | * Create a new tool agent that calls this tool in the `deep_researcher/agents/tool_agents` folder
242 | * Add the tool agent definition to the `init_tool_agents` function in `deep_researcher/agents/tool_agents/__init__.py`
243 | * Update the system prompt of `deep_researcher/agents/tool_selector_agent.py` to include the name and description of the new agent, so that the ToolSelectorAgent knows of its existence
244 | 
245 | ### Configuring Custom LLMs
246 | 
247 | This repository is in theory compatible with any LLMs that follow the OpenAI API specs. This includes the likes of DeepSeek as well as models served through OpenRouter. However, the models need to be compatible with [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs) in the OpenAI API spec (i.e. being able to set `response_format: {type: "json_schema", ...}`).
248 | 
249 | LLMs are configured and managed in the `deep_researcher/llm_config.py` file.
250 | 
251 | ## Trace Monitoring
252 | 
253 | The Deep Research assistant integrates with OpenAI's trace monitoring system. Each research session generates a trace ID that can be used to monitor the execution flow and agent interactions in real-time through the OpenAI platform.
254 | 
255 | ## Observations and Limitations
256 | 
257 | ### Rate Limits
258 | - The `DeepResearcher` runs a lot of searches and API calls in parallel (at any given point in time it could be ingesting 50-60 different web pages). As a result you may find that yourself hitting rate limits for AzureOpenAI, OpenAI, Gemini, Anthropic and other model providers particularly if you are on lower or free tiers. 
259 | - If you run into these errors, you may wish to use the `IterativeResearcher` instead which is less consumptive of API calls.
260 | 
261 | ### **Output Length:** 
262 | 
263 | LLMs are not good at following guidelines on output length. You typically run into two issues:
264 | 
265 | - LLMs are bad at counting. When giving length instructions, it's better to provide a reference that the model will be familiar with from its training data (e.g. 'length of a tweet', 'a few paragraphs', 'length of a book') rather than a specific word count. 
266 | - Even though the output token limit on many of these models is massive, it is very difficult to get them to produce more than 1-2,000 words per response. There are methods such as [this one](https://medium.com/@techsachin/longwriter-using-llm-agent-based-pipeline-to-scale-llms-output-window-size-to-10-000-words-33210d299e2b) to produce longer outputs.
267 | 
268 | We include an `output_length` parameter for the `IterativeResearcher` to give the user control but bear in mind the above limitations.
269 | 
270 | ## TODOs:
271 | 
272 | - [ ] Add unit tests for different model providers
273 | - [ ] Add example implementation for different models
274 | - [ ] Add compatibility with other search providers (e.g. SearXNG, Bing, Tavily, DuckDuckGo etc.)
275 | - [ ] Add caching (e.g. Redis) of scraped web pages to avoid duplicate work/calls
276 | - [ ] Add more specialized research tools (e.g. Wikipedia, arXiv, data analysis etc.)
277 | - [ ] Add PDF parser
278 | - [ ] Add integration / RAG for local files
279 | 
280 | ## Author
281 | 
282 | Created by Jai Juneja at [QX Labs](https://www.qxlabs.com).
283 | 


--------------------------------------------------------------------------------
/deep_researcher/__init__.py:
--------------------------------------------------------------------------------
1 | from .deep_research import DeepResearcher
2 | from .iterative_research import IterativeResearcher
3 | from .agents.baseclass import ResearchRunner
4 | from .llm_config import LLMConfig
5 | 
6 | __all__ = ["DeepResearcher", "IterativeResearcher", "ResearchRunner", "LLMConfig"]
7 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from .baseclass import ResearchAgent, ResearchRunner


--------------------------------------------------------------------------------
/deep_researcher/agents/baseclass.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Callable, Optional
 2 | from agents import Agent, Runner, RunResult
 3 | from agents.run_context import TContext
 4 | 
 5 | 
 6 | class ResearchAgent(Agent[TContext]):
 7 |     """
 8 |     This is a custom implementation of the OpenAI Agent class that supports output parsing
 9 |     for models that don't support structured output types. The user can specify an output_parser
10 |     function that will be called with the raw output from the agent. This can run custom logic 
11 |     such as cleaning up the output and converting it to a structured JSON object.
12 | 
13 |     Needs to be run with the ResearchRunner to work.
14 |     """
15 |     
16 |     def __init__(
17 |         self,
18 |         *args,
19 |         output_parser: Optional[Callable[[str], Any]] = None,
20 |         **kwargs
21 |     ):
22 |         # The output_parser is a function that only takes effect if output_type is not specified
23 |         self.output_parser = output_parser
24 | 
25 |         # If both are specified, we raise an error - they can't be used together
26 |         if self.output_parser and kwargs.get('output_type'):
27 |             raise ValueError("Cannot specify both output_parser and output_type")
28 |             
29 |         super().__init__(*args, **kwargs)
30 |     
31 | 
32 |     async def parse_output(self, run_result: RunResult) -> RunResult:
33 |         """
34 |         Process the RunResult by applying the output_parser to its final_output if specified.
35 |         This preserves the RunResult structure while modifying its content.
36 |         """
37 |         if self.output_parser:
38 |             raw_output = run_result.final_output            
39 |             parsed_output = self.output_parser(raw_output)
40 |             run_result.final_output = parsed_output            
41 |         return run_result
42 |     
43 | 
44 | class ResearchRunner(Runner):
45 |     """
46 |     Custom implementation of the OpenAI Runner class that supports output parsing
47 |     for models that don't support structured output types with tools. 
48 |     
49 |     Needs to be run with the ResearchAgent class.
50 |     """
51 |     
52 |     @classmethod
53 |     async def run(cls, *args, **kwargs) -> RunResult:
54 |         """
55 |         Run the agent and process its output with the custom parser if applicable.
56 |         """
57 |         # Call the original run method
58 |         result = await Runner.run(*args, **kwargs)
59 |         
60 |         # Get the starting agent
61 |         starting_agent = kwargs.get('starting_agent') or args[0]
62 |         
63 |         # If the starting agent is of type ResearchAgent, parse the output
64 |         if isinstance(starting_agent, ResearchAgent):
65 |             return await starting_agent.parse_output(result)
66 |         
67 |         return result


--------------------------------------------------------------------------------
/deep_researcher/agents/knowledge_gap_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to evaluate the state of the research report (typically done in a loop) and identify knowledge gaps that still 
 3 | need to be addressed.
 4 | 
 5 | The Agent takes as input a string in the following format:
 6 | ===========================================================
 7 | ORIGINAL QUERY: <original user query>
 8 | 
 9 | HISTORY OF ACTIONS, FINDINGS AND THOUGHTS: <breakdown of activities and findings carried out so far>
10 | ===========================================================
11 | 
12 | The Agent then:
13 | 1. Carefully reviews the current draft and assesses its completeness in answering the original query
14 | 2. Identifies specific knowledge gaps that still exist and need to be filled
15 | 3. Returns a KnowledgeGapOutput object
16 | """
17 | 
18 | from pydantic import BaseModel, Field
19 | from typing import List
20 | from .baseclass import ResearchAgent
21 | from ..llm_config import LLMConfig, model_supports_structured_output
22 | from datetime import datetime
23 | from .utils.parse_output import create_type_parser
24 | 
25 | class KnowledgeGapOutput(BaseModel):
26 |     """Output from the Knowledge Gap Agent"""
27 |     research_complete: bool = Field(description="Whether the research and findings are complete enough to end the research loop")
28 |     outstanding_gaps: List[str] = Field(description="List of knowledge gaps that still need to be addressed")
29 | 
30 | 
31 | INSTRUCTIONS = f"""
32 | You are a Research State Evaluator. Today's date is {datetime.now().strftime("%Y-%m-%d")}.
33 | Your job is to critically analyze the current state of a research report, 
34 | identify what knowledge gaps still exist and determine the best next step to take.
35 | 
36 | You will be given:
37 | 1. The original user query and any relevant background context to the query
38 | 2. A full history of the tasks, actions, findings and thoughts you've made up until this point in the research process
39 | 
40 | Your task is to:
41 | 1. Carefully review the findings and thoughts, particularly from the latest iteration, and assess their completeness in answering the original query
42 | 2. Determine if the findings are sufficiently complete to end the research loop
43 | 3. If not, identify up to 3 knowledge gaps that need to be addressed in sequence in order to continue with research - these should be relevant to the original query
44 | 
45 | Be specific in the gaps you identify and include relevant information as this will be passed onto another agent to process without additional context.
46 | 
47 | Only output JSON and follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
48 | {KnowledgeGapOutput.model_json_schema()}
49 | """
50 | 
51 | def init_knowledge_gap_agent(config: LLMConfig) -> ResearchAgent:
52 |     selected_model = config.fast_model
53 | 
54 |     return ResearchAgent(
55 |         name="KnowledgeGapAgent",
56 |         instructions=INSTRUCTIONS,
57 |         model=selected_model,
58 |         output_type=KnowledgeGapOutput if model_supports_structured_output(selected_model) else None,
59 |         output_parser=create_type_parser(KnowledgeGapOutput) if not model_supports_structured_output(selected_model) else None
60 |     )


--------------------------------------------------------------------------------
/deep_researcher/agents/long_writer_agent.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Agent used to synthesize a final report by iteratively writing each section of the report.
  3 | Used to produce long reports given drafts of each section. Broadly aligned with the methodology described here:
  4 | 
  5 | 
  6 | The LongWriterAgent takes as input a string in the following format:
  7 | ===========================================================
  8 | ORIGINAL QUERY: <original user query>
  9 | 
 10 | CURRENT REPORT DRAFT: <current working draft of the report, all sections up to the current one being written>
 11 | 
 12 | TITLE OF NEXT SECTION TO WRITE: <title of the next section of the report to be written>
 13 | 
 14 | DRAFT OF NEXT SECTION: <draft of the next section of the report>
 15 | ===========================================================
 16 | 
 17 | The Agent then:
 18 | 1. Reads the current draft and the draft of the next section
 19 | 2. Writes the next section of the report
 20 | 3. Produces an updated draft of the new section to fit the flow of the report
 21 | 4. Returns the updated draft of the new section along with references/citations
 22 | """
 23 | from .baseclass import ResearchAgent, ResearchRunner
 24 | from ..llm_config import LLMConfig, model_supports_structured_output
 25 | from .utils.parse_output import create_type_parser
 26 | from datetime import datetime
 27 | from pydantic import BaseModel, Field
 28 | from .proofreader_agent import ReportDraft
 29 | from typing import List, Tuple, Dict
 30 | import re
 31 | 
 32 | 
 33 | class LongWriterOutput(BaseModel):
 34 |     next_section_markdown: str = Field(description="The final draft of the next section in markdown format")
 35 |     references: List[str] = Field(description="A list of URLs and their corresponding reference numbers for the section")
 36 | 
 37 | 
 38 | INSTRUCTIONS = f"""
 39 | You are an expert report writer tasked with iteratively writing each section of a report. 
 40 | Today's date is {datetime.now().strftime('%Y-%m-%d')}.
 41 | You will be provided with:
 42 | 1. The original research query
 43 | 3. A final draft of the report containing the table of contents and all sections written up until this point (in the first iteration there will be no sections written yet)
 44 | 3. A first draft of the next section of the report to be written
 45 | 
 46 | OBJECTIVE:
 47 | 1. Write a final draft of the next section of the report with numbered citations in square brackets in the body of the report
 48 | 2. Produce a list of references to be appended to the end of the report
 49 | 
 50 | CITATIONS/REFERENCES:
 51 | The citations should be in numerical order, written in numbered square brackets in the body of the report.
 52 | Separately, a list of all URLs and their corresponding reference numbers will be included at the end of the report.
 53 | Follow the example below for formatting.
 54 | 
 55 | LongWriterOutput(
 56 |     next_section_markdown="The company specializes in IT consulting [1](https://example.com/first-source-url). It operates in the software services market which is expected to grow at 10% per year [2](https://example.com/second-source-url).",
 57 |     references=["[1] https://example.com/first-source-url", "[2] https://example.com/second-source-url"]
 58 | )
 59 | 
 60 | GUIDELINES:
 61 | - You can reformat and reorganize the flow of the content and headings within a section to flow logically, but DO NOT remove details that were included in the first draft
 62 | - Only remove text from the first draft if it is already mentioned earlier in the report, or if it should be covered in a later section per the table of contents
 63 | - Ensure the heading for the section matches the table of contents
 64 | - Format the final output and references section as markdown
 65 | - Do not include a title for the reference section, just a list of numbered references
 66 | 
 67 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
 68 | {LongWriterOutput.model_json_schema()}
 69 | """
 70 | 
 71 | def init_long_writer_agent(config: LLMConfig) -> ResearchAgent:
 72 |     selected_model = config.fast_model
 73 | 
 74 |     return ResearchAgent(
 75 |         name="LongWriterAgent",
 76 |         instructions=INSTRUCTIONS,
 77 |         model=selected_model,
 78 |         output_type=LongWriterOutput if model_supports_structured_output(selected_model) else None,
 79 |         output_parser=create_type_parser(LongWriterOutput) if not model_supports_structured_output(selected_model) else None
 80 |     )
 81 | 
 82 | 
 83 | async def write_next_section(
 84 |     long_writer_agent: ResearchAgent,
 85 |     original_query: str,
 86 |     report_draft: str,
 87 |     next_section_title: str,
 88 |     next_section_draft: str,
 89 | ) -> LongWriterOutput:
 90 |     """Write the next section of the report"""
 91 | 
 92 |     user_message = f"""
 93 |     <ORIGINAL QUERY>
 94 |     {original_query}
 95 |     </ORIGINAL QUERY>
 96 | 
 97 |     <CURRENT REPORT DRAFT>
 98 |     {report_draft or "No draft yet"}
 99 |     </CURRENT REPORT DRAFT>
100 | 
101 |     <TITLE OF NEXT SECTION TO WRITE>
102 |     {next_section_title}
103 |     </TITLE OF NEXT SECTION TO WRITE>
104 | 
105 |     <DRAFT OF NEXT SECTION>
106 |     {next_section_draft}
107 |     </DRAFT OF NEXT SECTION>
108 |     """
109 | 
110 |     result = await ResearchRunner.run(
111 |         long_writer_agent,
112 |         user_message,
113 |     )
114 | 
115 |     return result.final_output_as(LongWriterOutput)
116 | 
117 | 
118 | async def write_report(
119 |     long_writer_agent: ResearchAgent,
120 |     original_query: str,
121 |     report_title: str,
122 |     report_draft: ReportDraft,
123 | ) -> str:
124 |     """Write the final report by iteratively writing each section"""
125 | 
126 |     # Initialize the final draft of the report with the title and table of contents
127 |     final_draft = f"# {report_title}\n\n" + "## Table of Contents\n\n" + "\n".join([f"{i+1}. {section.section_title}" for i, section in enumerate(report_draft.sections)]) + "\n\n"
128 |     all_references = []
129 | 
130 |     for section in report_draft.sections:
131 |         # Produce the final draft of each section and add it to the report with corresponding references
132 |         next_section_draft = await write_next_section(long_writer_agent, original_query, final_draft, section.section_title, section.section_content)
133 |         section_markdown, all_references = reformat_references(
134 |             next_section_draft.next_section_markdown, 
135 |             next_section_draft.references,
136 |             all_references
137 |         )
138 |         section_markdown = reformat_section_headings(section_markdown)
139 |         final_draft += section_markdown + '\n\n'
140 | 
141 |     # Add the final references to the end of the report
142 |     final_draft += '## References:\n\n' + '  \n'.join(all_references)
143 |     return final_draft
144 | 
145 | 
146 | def reformat_references(
147 |         section_markdown: str, 
148 |         section_references: List[str], 
149 |         all_references: List[str] 
150 |     ) -> Tuple[str, List[str]]:
151 |     """
152 |     This method gracefully handles the re-numbering, de-duplication and re-formatting of references as new sections are added to the report draft.
153 |     It takes as input:
154 |     1. The markdown content of the new section containing inline references in square brackets, e.g. [1], [2]
155 |     2. The list of references for the new section, e.g. ["[1] https://example1.com", "[2] https://example2.com"]
156 |     3. The list of references covering all prior sections of the report
157 | 
158 |     It returns:
159 |     1. The updated markdown content of the new section with the references re-numbered and de-duplicated, such that they increment from the previous references
160 |     2. The updated list of references for the full report, to include the new section's references
161 |     """
162 |     def convert_ref_list_to_map(ref_list: List[str]) -> Dict[str, str]:
163 |         ref_map = {}
164 |         for ref in ref_list:
165 |             try:
166 |                 ref_num = int(ref.split(']')[0].strip('['))
167 |                 url = ref.split(']', 1)[1].strip()
168 |                 ref_map[url] = ref_num
169 |             except ValueError:
170 |                 print(f"Invalid reference format: {ref}")
171 |                 continue
172 |         return ref_map
173 | 
174 |     section_ref_map = convert_ref_list_to_map(section_references)
175 |     report_ref_map = convert_ref_list_to_map(all_references)
176 |     section_to_report_ref_map = {}
177 | 
178 |     report_urls = set(report_ref_map.keys())
179 |     ref_count = max(report_ref_map.values() or [0])
180 |     for url, section_ref_num in section_ref_map.items():
181 |         if url in report_urls:
182 |             section_to_report_ref_map[section_ref_num] = report_ref_map[url]
183 |         else:
184 |             # If the reference is not in the report, add it to the report
185 |             ref_count += 1
186 |             section_to_report_ref_map[section_ref_num] = ref_count
187 |             all_references.append(f"[{ref_count}] {url}")
188 | 
189 |     def replace_reference(match):
190 |         # Extract the reference number from the match
191 |         ref_num = int(match.group(1))
192 |         # Look up the new reference number
193 |         mapped_ref_num = section_to_report_ref_map.get(ref_num)
194 |         if mapped_ref_num:
195 |             return f'[{mapped_ref_num}]'
196 |         return ''
197 |     
198 |     # Replace all references in a single pass using a replacement function
199 |     section_markdown = re.sub(r'\[(\d+)\]', replace_reference, section_markdown)
200 | 
201 |     return section_markdown, all_references
202 | 
203 | 
204 | def reformat_section_headings(section_markdown: str) -> str:
205 |     """
206 |     Reformat the headings of a section to be consistent with the report, by rebasing the section's heading to be a level-2 heading
207 | 
208 |     E.g. this:
209 |     # Big Title
210 |     Some content
211 |     ## Subsection
212 | 
213 |     Becomes this:
214 |     ## Big Title
215 |     Some content
216 |     ### Subsection
217 |     """
218 |     # If the section is empty, return as-is
219 |     if not section_markdown.strip():
220 |         return section_markdown
221 | 
222 |     # Find the first heading level
223 |     first_heading_match = re.search(r'^(#+)\s', section_markdown, re.MULTILINE)
224 |     if not first_heading_match:
225 |         return section_markdown
226 | 
227 |     # Calculate the level adjustment needed
228 |     first_heading_level = len(first_heading_match.group(1))
229 |     level_adjustment = 2 - first_heading_level
230 | 
231 |     def adjust_heading_level(match):
232 |         hashes = match.group(1)
233 |         content = match.group(2)
234 |         new_level = max(2, len(hashes) + level_adjustment)
235 |         return '#' * new_level + ' ' + content
236 | 
237 |     # Apply the heading adjustment to all headings in one pass
238 |     return re.sub(r'^(#+)\s(.+)$', adjust_heading_level, section_markdown, flags=re.MULTILINE)
239 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/planner_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to produce an initial outline of the report, including a list of section titles and the key question to be 
 3 | addressed in each section.
 4 | 
 5 | The Agent takes as input a string in the following format:
 6 | ===========================================================
 7 | QUERY: <original user query>
 8 | ===========================================================
 9 | 
10 | The Agent then outputs a ReportPlan object, which includes:
11 | 1. A summary of initial background context (if needed), based on web searches and/or crawling
12 | 2. An outline of the report that includes a list of section titles and the key question to be addressed in each section
13 | """
14 | 
15 | from pydantic import BaseModel, Field
16 | from typing import List
17 | from .baseclass import ResearchAgent
18 | from ..llm_config import LLMConfig, model_supports_structured_output
19 | from .tool_agents.crawl_agent import init_crawl_agent
20 | from .tool_agents.search_agent import init_search_agent
21 | from .utils.parse_output import create_type_parser
22 | from datetime import datetime
23 | 
24 | 
25 | class ReportPlanSection(BaseModel):
26 |     """A section of the report that needs to be written"""
27 |     title: str = Field(description="The title of the section")
28 |     key_question: str = Field(description="The key question to be addressed in the section")
29 | 
30 | 
31 | class ReportPlan(BaseModel):
32 |     """Output from the Report Planner Agent"""
33 |     background_context: str = Field(description="A summary of supporting context that can be passed onto the research agents")
34 |     report_outline: List[ReportPlanSection] = Field(description="List of sections that need to be written in the report")
35 |     report_title: str = Field(description="The title of the report")
36 | 
37 | 
38 | INSTRUCTIONS = f"""
39 | You are a research manager, managing a team of research agents. Today's date is {datetime.now().strftime("%Y-%m-%d")}.
40 | Given a research query, your job is to produce an initial outline of the report (section titles and key questions),
41 | as well as some background context. Each section will be assigned to a different researcher in your team who will then
42 | carry out research on the section.
43 | 
44 | You will be given:
45 | - An initial research query
46 | 
47 | Your task is to:
48 | 1. Produce 1-2 paragraphs of initial background context (if needed) on the query by running web searches or crawling websites
49 | 2. Produce an outline of the report that includes a list of section titles and the key question to be addressed in each section
50 | 3. Provide a title for the report that will be used as the main heading
51 | 
52 | Guidelines:
53 | - Each section should cover a single topic/question that is independent of other sections
54 | - The key question for each section should include both the NAME and DOMAIN NAME / WEBSITE (if available and applicable) if it is related to a company, product or similar
55 | - The background_context should not be more than 2 paragraphs
56 | - The background_context should be very specific to the query and include any information that is relevant for researchers across all sections of the report
57 | - The background_context should be draw only from web search or crawl results rather than prior knowledge (i.e. it should only be included if you have called tools)
58 | - For example, if the query is about a company, the background context should include some basic information about what the company does
59 | - DO NOT do more than 2 tool calls
60 | 
61 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
62 | {ReportPlan.model_json_schema()}
63 | """
64 | 
65 | def init_planner_agent(config: LLMConfig) -> ResearchAgent:
66 |     selected_model = config.reasoning_model
67 |     search_agent = init_search_agent(config)
68 |     crawl_agent = init_crawl_agent(config)
69 | 
70 |     return ResearchAgent(
71 |             name="PlannerAgent",
72 |             instructions=INSTRUCTIONS,
73 |         tools=[
74 |             search_agent.as_tool(
75 |                 tool_name="web_search",
76 |                 tool_description="Use this tool to search the web for information relevant to the query - provide a query with 3-6 words as input"
77 |             ),
78 |             crawl_agent.as_tool(
79 |                 tool_name="crawl_website",
80 |                 tool_description="Use this tool to crawl a website for information relevant to the query - provide a starting URL as input"
81 |             )
82 |         ],
83 |         model=selected_model,
84 |         output_type=ReportPlan if model_supports_structured_output(selected_model) else None,
85 |         output_parser=create_type_parser(ReportPlan) if not model_supports_structured_output(selected_model) else None
86 |     )
87 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/proofreader_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to produce the final draft of a report given initial drafts of each section.
 3 | 
 4 | The Agent takes as input the original user query and a stringified object of type ReportDraft.model_dump_json() (defined below).
 5 | 
 6 | ====
 7 | QUERY: <original user query>
 8 | 
 9 | REPORT DRAFT: <stringified ReportDraft object containing all draft sections>
10 | ====
11 | 
12 | The Agent then outputs the final markdown for the report as a string.
13 | """
14 | 
15 | from pydantic import BaseModel, Field
16 | from typing import List
17 | from .baseclass import ResearchAgent
18 | from ..llm_config import LLMConfig
19 | from datetime import datetime
20 | 
21 | 
22 | class ReportDraftSection(BaseModel):
23 |     """A section of the report that needs to be written"""
24 |     section_title: str = Field(description="The title of the section")
25 |     section_content: str = Field(description="The content of the section")
26 | 
27 | 
28 | class ReportDraft(BaseModel):
29 |     """Output from the Report Planner Agent"""
30 |     sections: List[ReportDraftSection] = Field(description="List of sections that are in the report")
31 | 
32 | 
33 | INSTRUCTIONS = f"""
34 | You are a research expert who proofreads and edits research reports.
35 | Today's date is {datetime.now().strftime("%Y-%m-%d")}.
36 | 
37 | You are given:
38 | 1. The original query topic for the report
39 | 2. A first draft of the report in ReportDraft format containing each section in sequence
40 | 
41 | Your task is to:
42 | 1. **Combine sections:** Concatenate the sections into a single string
43 | 2. **Add section titles:** Add the section titles to the beginning of each section in markdown format, as well as a main title for the report
44 | 3. **De-duplicate:** Remove duplicate content across sections to avoid repetition
45 | 4. **Remove irrelevant sections:** If any sections or sub-sections are completely irrelevant to the query, remove them
46 | 5. **Refine wording:** Edit the wording of the report to be polished, concise and punchy, but **without eliminating any detail** or large chunks of text
47 | 6. **Add a summary:** Add a short report summary / outline to the beginning of the report to provide an overview of the sections and what is discussed
48 | 7. **Preserve sources:** Preserve all sources / references - move the long list of references to the end of the report
49 | 8. **Update reference numbers:** Continue to include reference numbers in square brackets  ([1], [2], [3], etc.) in the main body of the report, but update the numbering to match the new order of references at the end of the report
50 | 9. **Output final report:** Output the final report in markdown format (do not wrap it in a code block)
51 | 
52 | Guidelines:
53 | - Do not add any new facts or data to the report
54 | - Do not remove any content from the report unless it is very clearly wrong, contradictory or irrelevant
55 | - Remove or reformat any redundant or excessive headings, and ensure that the final nesting of heading levels is correct
56 | - Ensure that the final report flows well and has a logical structure
57 | - Include all sources and references that are present in the final report
58 | """
59 | 
60 | def init_proofreader_agent(config: LLMConfig) -> ResearchAgent:
61 |     selected_model = config.fast_model
62 | 
63 |     return ResearchAgent(
64 |         name="ProofreaderAgent",
65 |         instructions=INSTRUCTIONS,
66 |         model=selected_model
67 |     )


--------------------------------------------------------------------------------
/deep_researcher/agents/thinking_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to reflect on the research process so far and share your latest thoughts.
 3 | 
 4 | The Agent takes as input a string in the following format:
 5 | ===========================================================
 6 | ORIGINAL QUERY: <original user query>
 7 | 
 8 | BACKGROUND CONTEXT: <supporting background context related to the original query>
 9 | 
10 | HISTORY OF ACTIONS, FINDINGS AND THOUGHTS: <a log of prior iterations of the research process>
11 | ===========================================================
12 | 
13 | The Agent then outputs a string containing its latest thoughts on the research process.
14 | """
15 | from .baseclass import ResearchAgent
16 | from ..llm_config import LLMConfig
17 | from datetime import datetime
18 | 
19 | INSTRUCTIONS = f"""
20 | You are a research expert who is managing a research process in iterations. Today's date is {datetime.now().strftime("%Y-%m-%d")}.
21 | 
22 | You are given:
23 | 1. The original research query along with some supporting background context
24 | 2. A history of the tasks, actions, findings and thoughts you've made up until this point in the research process (on iteration 1 you will be at the start of the research process, so this will be empty)
25 | 
26 | Your objective is to reflect on the research process so far and share your latest thoughts.
27 | 
28 | Specifically, your thoughts should include reflections on questions such as:
29 | - What have you learned from the last iteration?
30 | - What new areas would you like to explore next, or existing topics you'd like to go deeper into?
31 | - Were you able to retrieve the information you were looking for in the last iteration?
32 | - If not, should we change our approach or move to the next topic?
33 | - Is there any info that is contradictory or conflicting?
34 | 
35 | Guidelines:
36 | - Share you stream of consciousness on the above questions as raw text
37 | - Keep your response concise and informal
38 | - Focus most of your thoughts on the most recent iteration and how that influences this next iteration
39 | - Our aim is to do very deep and thorough research - bear this in mind when reflecting on the research process
40 | - DO NOT produce a draft of the final report. This is not your job.
41 | - If this is the first iteration (i.e. no data from prior iterations), provide thoughts on what info we need to gather in the first iteration to get started
42 | """
43 | 
44 | def init_thinking_agent(config: LLMConfig) -> ResearchAgent:
45 |     selected_model = config.reasoning_model
46 | 
47 |     return ResearchAgent(
48 |         name="ThinkingAgent",
49 |         instructions=INSTRUCTIONS,
50 |         model=selected_model,
51 |     )
52 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/tool_agents/__init__.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel, Field
 2 | 
 3 | class ToolAgentOutput(BaseModel):
 4 |     """Standard output for all tool agents"""
 5 |     output: str
 6 |     sources: list[str] = Field(default_factory=list)
 7 | 
 8 | from .search_agent import init_search_agent
 9 | from .crawl_agent import init_crawl_agent
10 | from ...llm_config import LLMConfig
11 | from ..baseclass import ResearchAgent
12 | 
13 | def init_tool_agents(config: LLMConfig) -> dict[str, ResearchAgent]:
14 |     search_agent = init_search_agent(config)
15 |     crawl_agent = init_crawl_agent(config)
16 | 
17 |     return {
18 |         "WebSearchAgent": search_agent,
19 |         "SiteCrawlerAgent": crawl_agent,
20 |     }
21 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/tool_agents/crawl_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to crawl a website and return the results.
 3 | 
 4 | The SearchAgent takes as input a string in the format of AgentTask.model_dump_json(), or can take a simple starting url string as input
 5 | 
 6 | The Agent then:
 7 | 1. Uses the crawl_website tool to crawl the website
 8 | 2. Writes a 3+ paragraph summary of the crawled contents
 9 | 3. Includes citations/URLs in brackets next to information sources
10 | 4. Returns the formatted summary as a string
11 | """
12 | 
13 | from ...tools import crawl_website
14 | from . import ToolAgentOutput
15 | from ...llm_config import LLMConfig, model_supports_structured_output
16 | from ..baseclass import ResearchAgent
17 | from ..utils.parse_output import create_type_parser
18 | 
19 | 
20 | INSTRUCTIONS = f"""
21 | You are a web craling agent that crawls the contents of a website answers a query based on the crawled contents. Follow these steps exactly:
22 | 
23 | * From the provided information, use the 'entity_website' as the starting_url for the web crawler
24 | * Crawl the website using the crawl_website tool
25 | * After using the crawl_website tool, write a 3+ paragraph summary that captures the main points from the crawled contents
26 | * In your summary, try to comprehensively answer/address the 'gaps' and 'query' provided (if available)
27 | * If the crawled contents are not relevant to the 'gaps' or 'query', simply write "No relevant results found"
28 | * Use headings and bullets to organize the summary if needed
29 | * Include citations/URLs in brackets next to all associated information in your summary
30 | * Only run the crawler once
31 | 
32 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
33 | {ToolAgentOutput.model_json_schema()}
34 | """
35 | 
36 | def init_crawl_agent(config: LLMConfig) -> ResearchAgent:
37 |     selected_model = config.fast_model
38 | 
39 |     return ResearchAgent(
40 |         name="SiteCrawlerAgent",
41 |         instructions=INSTRUCTIONS,
42 |         tools=[crawl_website],
43 |         model=selected_model,
44 |         output_type=ToolAgentOutput if model_supports_structured_output(selected_model) else None,
45 |         output_parser=create_type_parser(ToolAgentOutput) if not model_supports_structured_output(selected_model) else None
46 |     )
47 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/tool_agents/search_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to perform web searches and summarize the results.
 3 | 
 4 | The SearchAgent takes as input a string in the format of AgentTask.model_dump_json(), or can take a simple query string as input
 5 | 
 6 | The Agent then:
 7 | 1. Uses the web_search tool to retrieve search results
 8 | 2. Analyzes the retrieved information
 9 | 3. Writes a 3+ paragraph summary of the search results
10 | 4. Includes citations/URLs in brackets next to information sources
11 | 5. Returns the formatted summary as a string
12 | 
13 | The agent can use either OpenAI's built-in web search capability or a custom
14 | web search implementation based on environment configuration.
15 | """
16 | 
17 | from agents import WebSearchTool
18 | from ...tools.web_search import create_web_search_tool
19 | from ...llm_config import LLMConfig, model_supports_structured_output, get_base_url
20 | from . import ToolAgentOutput
21 | from ..baseclass import ResearchAgent
22 | from ..utils.parse_output import create_type_parser
23 | 
24 | INSTRUCTIONS = f"""You are a research assistant that specializes in retrieving and summarizing information from the web.
25 | 
26 | OBJECTIVE:
27 | Given an AgentTask, follow these steps:
28 | - Convert the 'query' into an optimized SERP search term for Google, limited to 3-5 words
29 | - If an 'entity_website' is provided, make sure to include the domain name in your optimized Google search term
30 | - Enter the optimized search term into the web_search tool
31 | - After using the web_search tool, write a 3+ paragraph summary that captures the main points from the search results
32 | 
33 | GUIDELINES:
34 | - In your summary, try to comprehensively answer/address the 'gap' provided (which is the objective of the search)
35 | - The summary should always quote detailed facts, figures and numbers where these are available
36 | - If the search results are not relevant to the search term or do not address the 'gap', simply write "No relevant results found"
37 | - Use headings and bullets to organize the summary if needed
38 | - Include citations/URLs in brackets next to all associated information in your summary
39 | - Do not make additional searches
40 | 
41 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
42 | {ToolAgentOutput.model_json_schema()}
43 | """
44 | 
45 | def init_search_agent(config: LLMConfig) -> ResearchAgent:
46 |     selected_model = config.fast_model
47 |     provider_base_url = get_base_url(selected_model)
48 | 
49 |     if config.search_provider == "openai" and 'openai.com' not in provider_base_url:
50 |         raise ValueError(f"You have set the SEARCH_PROVIDER to 'openai', but are using the model {str(selected_model.model)} which is not an OpenAI model")
51 |     elif config.search_provider == "openai":
52 |         web_search_tool = WebSearchTool()
53 |     else:
54 |         web_search_tool = create_web_search_tool(config)
55 | 
56 |     return ResearchAgent(
57 |         name="WebSearchAgent",
58 |         instructions=INSTRUCTIONS,
59 |         tools=[web_search_tool],
60 |         model=selected_model,
61 |         output_type=ToolAgentOutput if model_supports_structured_output(selected_model) else None,
62 |         output_parser=create_type_parser(ToolAgentOutput) if not model_supports_structured_output(selected_model) else None
63 |     )


--------------------------------------------------------------------------------
/deep_researcher/agents/tool_selector_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to determine which specialized agents should be used to address knowledge gaps.
 3 | 
 4 | The Agent takes as input a string in the following format:
 5 | ===========================================================
 6 | ORIGINAL QUERY: <original user query>
 7 | 
 8 | KNOWLEDGE GAP TO ADDRESS: <knowledge gap that needs to be addressed>
 9 | 
10 | BACKGROUND CONTEXT: <supporting background context related to the original query>
11 | 
12 | HISTORY OF ACTIONS, FINDINGS AND THOUGHTS: <a log of prior iterations of the research process>
13 | ===========================================================
14 | 
15 | The Agent then:
16 | 1. Analyzes the knowledge gap to determine which agents are best suited to address it
17 | 2. Returns an AgentSelectionPlan object containing a list of AgentTask objects
18 | 
19 | The available agents are:
20 | - WebSearchAgent: General web search for broad topics
21 | - SiteCrawlerAgent: Crawl the pages of a specific website to retrieve information about it
22 | """
23 | 
24 | from pydantic import BaseModel, Field
25 | from typing import List, Optional
26 | from ..llm_config import LLMConfig, model_supports_structured_output
27 | from datetime import datetime
28 | from .baseclass import ResearchAgent
29 | from .utils.parse_output import create_type_parser
30 | 
31 | 
32 | class AgentTask(BaseModel):
33 |     """A task for a specific agent to address knowledge gaps"""
34 |     gap: Optional[str] = Field(description="The knowledge gap being addressed", default=None)
35 |     agent: str = Field(description="The name of the agent to use")
36 |     query: str = Field(description="The specific query for the agent")
37 |     entity_website: Optional[str] = Field(description="The website of the entity being researched, if known", default=None)
38 | 
39 | 
40 | class AgentSelectionPlan(BaseModel):
41 |     """Plan for which agents to use for knowledge gaps"""
42 |     tasks: List[AgentTask] = Field(description="List of agent tasks to address knowledge gaps")
43 | 
44 | 
45 | INSTRUCTIONS = f"""
46 | You are an Tool Selector responsible for determining which specialized agents should address a knowledge gap in a research project.
47 | Today's date is {datetime.now().strftime("%Y-%m-%d")}.
48 | 
49 | You will be given:
50 | 1. The original user query
51 | 2. A knowledge gap identified in the research
52 | 3. A full history of the tasks, actions, findings and thoughts you've made up until this point in the research process
53 | 
54 | Your task is to decide:
55 | 1. Which specialized agents are best suited to address the gap
56 | 2. What specific queries should be given to the agents (keep this short - 3-6 words)
57 | 
58 | Available specialized agents:
59 | - WebSearchAgent: General web search for broad topics (can be called multiple times with different queries)
60 | - SiteCrawlerAgent: Crawl the pages of a specific website to retrieve information about it - use this if you want to find out something about a particular company, entity or product
61 | 
62 | Guidelines:
63 | - Aim to call at most 3 agents at a time in your final output
64 | - You can list the WebSearchAgent multiple times with different queries if needed to cover the full scope of the knowledge gap
65 | - Be specific and concise (3-6 words) with the agent queries - they should target exactly what information is needed
66 | - If you know the website or domain name of an entity being researched, always include it in the query
67 | - If a gap doesn't clearly match any agent's capability, default to the WebSearchAgent
68 | - Use the history of actions / tool calls as a guide - try not to repeat yourself if an approach didn't work previously
69 | 
70 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
71 | {AgentSelectionPlan.model_json_schema()}
72 | """
73 | 
74 | def init_tool_selector_agent(config: LLMConfig) -> ResearchAgent:
75 |     selected_model = config.reasoning_model
76 | 
77 |     return ResearchAgent(
78 |         name="ToolSelectorAgent",
79 |         instructions=INSTRUCTIONS,
80 |         model=selected_model,
81 |         output_type=AgentSelectionPlan if model_supports_structured_output(selected_model) else None,
82 |         output_parser=create_type_parser(AgentSelectionPlan) if not model_supports_structured_output(selected_model) else None
83 |     )
84 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qx-labs/agents-deep-research/4e3ce35be10a59dfdbe4180ce60085a6e5ffb7ff/deep_researcher/agents/utils/__init__.py


--------------------------------------------------------------------------------
/deep_researcher/agents/utils/parse_output.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pydantic import BaseModel
 3 | from typing import Any, Callable
 4 | 
 5 | 
 6 | class OutputParserError(Exception):
 7 |     """
 8 |     Exception raised when the output parser fails to parse the output.
 9 |     """
10 |     def __init__(self, message, output=None):
11 |         self.message = message
12 |         self.output = output
13 |         super().__init__(self.message)
14 |         
15 |     def __str__(self):
16 |         if self.output:
17 |             return f"{self.message}\nProblematic output: {self.output}"
18 |         return self.message
19 | 
20 | 
21 | def find_json_in_string(string: str) -> str:
22 |     """
23 |     Method to extract all text in the left-most brace that appears in a string.
24 |     Used to extract JSON from a string (note that this function does not validate the JSON).
25 | 
26 |     Example:
27 |         string = "bla bla bla {this is {some} text{{}and it's sneaky}} because {it's} confusing"
28 |         output = "{this is {some} text{{}and it's sneaky}}"
29 |     """
30 |     stack = 0
31 |     start_index = None
32 | 
33 |     for i, c in enumerate(string):
34 |         if c == '{':
35 |             if stack == 0:
36 |                 start_index = i  # Start index of the first '{'
37 |             stack += 1  # Push to stack
38 |         elif c == '}':
39 |             stack -= 1  # Pop stack
40 |             if stack == 0:
41 |                 # Return the substring from the start of the first '{' to the current '}'
42 |                 return string[start_index:i + 1] if start_index is not None else ""
43 | 
44 |     # If no complete set of braces is found, return an empty string
45 |     return ""
46 | 
47 | 
48 | def parse_json_output(output: str) -> Any:
49 |     """Take a string output and parse it as JSON"""
50 |     # First try to load the string as JSON
51 |     try:
52 |         return json.loads(output)
53 |     except json.JSONDecodeError as e:
54 |         pass
55 | 
56 |     # If that fails, assume that the output is in a code block - remove the code block markers and try again
57 |     parsed_output = output
58 |     parsed_output = parsed_output.split("```")[1]
59 |     parsed_output = parsed_output.split("```")[0]
60 |     if parsed_output.startswith("json") or parsed_output.startswith("JSON"):
61 |         parsed_output = parsed_output[4:].strip()
62 |     try:
63 |         return json.loads(parsed_output)
64 |     except json.JSONDecodeError:
65 |         pass
66 | 
67 |     # As a last attempt, try to manually find the JSON object in the output and parse it
68 |     parsed_output = find_json_in_string(output)
69 |     if parsed_output:
70 |         try:
71 |             return json.loads(parsed_output)
72 |         except json.JSONDecodeError:
73 |             raise OutputParserError(f"Failed to parse output as JSON", output)
74 | 
75 |     # If all fails, raise an error
76 |     raise OutputParserError(f"Failed to parse output as JSON", output)
77 | 
78 | 
79 | def create_type_parser(type: BaseModel) -> Callable[[str], BaseModel]:
80 |     """Create a function that takes a string output and parses it as a specified Pydantic model"""
81 | 
82 |     def convert_json_string_to_type(output: str) -> BaseModel:
83 |         """Take a string output and parse it as a Pydantic model"""
84 |         output_dict = parse_json_output(output)
85 |         return type.model_validate(output_dict)
86 | 
87 |     return convert_json_string_to_type
88 | 


--------------------------------------------------------------------------------
/deep_researcher/agents/writer_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent used to synthesize a final report based on provided findings.
 3 | 
 4 | The WriterAgent takes as input a string in the following format:
 5 | ===========================================================
 6 | QUERY: <original user query>
 7 | 
 8 | FINDINGS: <findings from the iterative research process>
 9 | ===========================================================
10 | 
11 | The Agent then:
12 | 1. Generates a comprehensive markdown report based on all available information
13 | 2. Includes proper citations for sources in the format [1], [2], etc.
14 | 3. Returns a string containing the markdown formatted report
15 | """
16 | from .baseclass import ResearchAgent
17 | from ..llm_config import LLMConfig
18 | from datetime import datetime
19 | 
20 | INSTRUCTIONS = f"""
21 | You are a senior researcher tasked with comprehensively answering a research query. 
22 | Today's date is {datetime.now().strftime('%Y-%m-%d')}.
23 | You will be provided with the original query along with research findings put together by a research assistant.
24 | Your objective is to generate the final response in markdown format.
25 | The response should be as lengthy and detailed as possible with the information provided, focusing on answering the original query.
26 | In your final output, include references to the source URLs for all information and data gathered. 
27 | This should be formatted in the form of a numbered square bracket next to the relevant information, 
28 | followed by a list of URLs at the end of the response, per the example below.
29 | 
30 | EXAMPLE REFERENCE FORMAT:
31 | The company has XYZ products [1]. It operates in the software services market which is expected to grow at 10% per year [2].
32 | 
33 | References:
34 | [1] https://example.com/first-source-url
35 | [2] https://example.com/second-source-url
36 | 
37 | GUIDELINES:
38 | * Answer the query directly, do not include unrelated or tangential information.
39 | * Adhere to any instructions on the length of your final response if provided in the user prompt.
40 | * If any additional guidelines are provided in the user prompt, follow them exactly and give them precedence over these system instructions.
41 | """
42 | 
43 | def init_writer_agent(config: LLMConfig) -> ResearchAgent:
44 |     selected_model = config.main_model
45 | 
46 |     return ResearchAgent(
47 |         name="WriterAgent",
48 |         instructions=INSTRUCTIONS,
49 |         model=selected_model,
50 |     )
51 | 


--------------------------------------------------------------------------------
/deep_researcher/deep_research.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import time
  3 | from .iterative_research import IterativeResearcher
  4 | from .agents.planner_agent import init_planner_agent, ReportPlan, ReportPlanSection
  5 | from .agents.proofreader_agent import ReportDraftSection, ReportDraft, init_proofreader_agent
  6 | from .agents.long_writer_agent import init_long_writer_agent, write_report
  7 | from .agents.baseclass import ResearchRunner
  8 | from typing import List, Optional
  9 | from agents.tracing import trace, gen_trace_id, custom_span
 10 | from .llm_config import LLMConfig, create_default_config
 11 | 
 12 | 
 13 | class DeepResearcher:
 14 |     """
 15 |     Manager for the deep research workflow that breaks down a query into a report plan with sections and then runs an iterative research loop for each section.
 16 |     """
 17 |     def __init__(
 18 |             self, 
 19 |             max_iterations: int = 5,
 20 |             max_time_minutes: int = 10,
 21 |             verbose: bool = True,
 22 |             tracing: bool = False,
 23 |             config: Optional[LLMConfig] = None
 24 |         ):
 25 |         self.max_iterations = max_iterations
 26 |         self.max_time_minutes = max_time_minutes
 27 |         self.verbose = verbose
 28 |         self.tracing = tracing
 29 |         self.config = create_default_config() if not config else config
 30 |         self.planner_agent = init_planner_agent(self.config)
 31 |         self.proofreader_agent = init_proofreader_agent(self.config)
 32 |         self.long_writer_agent = init_long_writer_agent(self.config)
 33 | 
 34 |     async def run(self, query: str) -> str:
 35 |         """Run the deep research workflow"""
 36 |         start_time = time.time()
 37 | 
 38 |         if self.tracing:
 39 |             trace_id = gen_trace_id()
 40 |             workflow_trace = trace("deep_researcher", trace_id=trace_id)
 41 |             print(f"View trace: https://platform.openai.com/traces/trace?trace_id={trace_id}")
 42 |             workflow_trace.start(mark_as_current=True)
 43 | 
 44 |         # First build the report plan which outlines the sections and compiles any relevant background context on the query
 45 |         report_plan: ReportPlan = await self._build_report_plan(query)
 46 | 
 47 |         # Run the independent research loops concurrently for each section and gather the results
 48 |         research_results: List[str] = await self._run_research_loops(report_plan)
 49 | 
 50 |         # Create the final report from the original report plan and the drafts of each section
 51 |         final_report: str = await self._create_final_report(query, report_plan, research_results)
 52 | 
 53 |         elapsed_time = time.time() - start_time
 54 |         self._log_message(f"DeepResearcher completed in {int(elapsed_time // 60)} minutes and {int(elapsed_time % 60)} seconds")
 55 | 
 56 |         if self.tracing:
 57 |             workflow_trace.finish(reset_current=True)
 58 | 
 59 |         return final_report
 60 | 
 61 |     async def _build_report_plan(self, query: str) -> ReportPlan:
 62 |         """Build the initial report plan including the report outline (sections and key questions) and background context"""
 63 |         if self.tracing:
 64 |             span = custom_span(name="build_report_plan")
 65 |             span.start(mark_as_current=True)
 66 | 
 67 |         self._log_message("=== Building Report Plan ===")
 68 |         user_message = f"QUERY: {query}"
 69 |         result = await ResearchRunner.run(
 70 |             self.planner_agent,
 71 |             user_message
 72 |         )
 73 |         report_plan = result.final_output_as(ReportPlan)
 74 | 
 75 |         if self.verbose:
 76 |             num_sections = len(report_plan.report_outline)
 77 |             message_log = '\n\n'.join(f"Section: {section.title}\nKey question: {section.key_question}" for section in report_plan.report_outline)
 78 |             if report_plan.background_context:
 79 |                 message_log += f"\n\nThe following background context has been included for the report build:\n{report_plan.background_context}"
 80 |             else:
 81 |                 message_log += "\n\nNo background context was provided for the report build.\n"
 82 |             self._log_message(f"Report plan created with {num_sections} sections:\n{message_log}")
 83 | 
 84 |         if self.tracing:
 85 |             span.finish(reset_current=True)
 86 | 
 87 |         return report_plan
 88 | 
 89 |     async def _run_research_loops(
 90 |         self, 
 91 |         report_plan: ReportPlan
 92 |     ) -> List[str]:
 93 |         """For a given ReportPlan, run a research loop concurrently for each section and gather the results"""
 94 |         async def run_research_for_section(section: ReportPlanSection):
 95 |             iterative_researcher = IterativeResearcher(
 96 |                 max_iterations=self.max_iterations,
 97 |                 max_time_minutes=self.max_time_minutes,
 98 |                 verbose=self.verbose,
 99 |                 tracing=False,  # Do not trace as this will conflict with the tracing we already have set up for the deep researcher
100 |                 config=self.config
101 |             )
102 |             args = {
103 |                 "query": section.key_question,
104 |                 "output_length": "",
105 |                 "output_instructions": "",
106 |                 "background_context": report_plan.background_context,
107 |             }
108 |             
109 |             # Only use custom span if tracing is enabled
110 |             if self.tracing:
111 |                 with custom_span(
112 |                     name=f"iterative_researcher:{section.title}", 
113 |                     data={"key_question": section.key_question}
114 |                 ):
115 |                     return await iterative_researcher.run(**args)
116 |             else:
117 |                 return await iterative_researcher.run(**args)
118 |         
119 |         self._log_message("=== Initializing Research Loops ===")
120 |         # Run all research loops concurrently in a single gather call
121 |         research_results = await asyncio.gather(
122 |             *(run_research_for_section(section) for section in report_plan.report_outline)
123 |         )
124 |         return research_results
125 | 
126 |     async def _create_final_report(
127 |         self, 
128 |         query: str, 
129 |         report_plan: ReportPlan, 
130 |         section_drafts: List[str],
131 |         use_long_writer: bool = True
132 |     ) -> str:
133 |         """Create the final report from the original report plan and the drafts of each section"""
134 |         if self.tracing:
135 |             span = custom_span(name="create_final_report")
136 |             span.start(mark_as_current=True)
137 | 
138 |         # Each section is a string containing the markdown for the section
139 |         # From this we need to build a ReportDraft object to feed to the final proofreader agent
140 |         report_draft = ReportDraft(
141 |             sections=[]
142 |         )
143 |         for i, section_draft in enumerate(section_drafts):
144 |             report_draft.sections.append(
145 |                 ReportDraftSection(
146 |                     section_title=report_plan.report_outline[i].title,
147 |                     section_content=section_draft
148 |                 )
149 |             )
150 | 
151 |         self._log_message("\n=== Building Final Report ===")
152 | 
153 |         if use_long_writer:
154 |             final_output = await write_report(self.long_writer_agent, query, report_plan.report_title, report_draft)
155 |         else:
156 |             user_prompt = f"QUERY:\n{query}\n\nREPORT DRAFT:\n{report_draft.model_dump_json()}"
157 |             # Run the proofreader agent to produce the final report
158 |             final_report = await ResearchRunner.run(
159 |                 self.proofreader_agent,
160 |                 user_prompt
161 |             )
162 |             final_output = final_report.final_output
163 | 
164 |         self._log_message(f"Final report completed")
165 | 
166 |         if self.tracing:
167 |             span.finish(reset_current=True)
168 | 
169 |         return final_output
170 | 
171 |     def _log_message(self, message: str) -> None:
172 |         """Log a message if verbose is True"""
173 |         if self.verbose:
174 |             print(message)


--------------------------------------------------------------------------------
/deep_researcher/iterative_research.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import asyncio
  3 | import time
  4 | from typing import Dict, List, Optional
  5 | from agents import custom_span, gen_trace_id, trace
  6 | from .agents.baseclass import ResearchRunner
  7 | from .agents.writer_agent import init_writer_agent
  8 | from .agents.knowledge_gap_agent import KnowledgeGapOutput, init_knowledge_gap_agent
  9 | from .agents.tool_selector_agent import AgentTask, AgentSelectionPlan, init_tool_selector_agent
 10 | from .agents.thinking_agent import init_thinking_agent
 11 | from .agents.tool_agents import init_tool_agents, ToolAgentOutput
 12 | from pydantic import BaseModel, Field
 13 | from .llm_config import LLMConfig, create_default_config
 14 | 
 15 | 
 16 | class IterationData(BaseModel):
 17 |     """Data for a single iteration of the research loop."""
 18 |     gap: str = Field(description="The gap addressed in the iteration", default_factory=list)
 19 |     tool_calls: List[str] = Field(description="The tool calls made", default_factory=list)
 20 |     findings: List[str] = Field(description="The findings collected from tool calls", default_factory=list)
 21 |     thought: List[str] = Field(description="The thinking done to reflect on the success of the iteration and next steps", default_factory=list)
 22 | 
 23 | 
 24 | class Conversation(BaseModel):
 25 |     """A conversation between the user and the iterative researcher."""
 26 |     history: List[IterationData] = Field(description="The data for each iteration of the research loop", default_factory=list)
 27 | 
 28 |     def add_iteration(self, iteration_data: Optional[IterationData] = None):
 29 |         if iteration_data is None:
 30 |             iteration_data = IterationData()
 31 |         self.history.append(iteration_data)
 32 |     
 33 |     def set_latest_gap(self, gap: str):
 34 |         self.history[-1].gap = gap
 35 | 
 36 |     def set_latest_tool_calls(self, tool_calls: List[str]):
 37 |         self.history[-1].tool_calls = tool_calls
 38 | 
 39 |     def set_latest_findings(self, findings: List[str]):
 40 |         self.history[-1].findings = findings
 41 | 
 42 |     def set_latest_thought(self, thought: str):
 43 |         self.history[-1].thought = thought
 44 | 
 45 |     def get_latest_gap(self) -> str:
 46 |         return self.history[-1].gap
 47 |     
 48 |     def get_latest_tool_calls(self) -> List[str]:
 49 |         return self.history[-1].tool_calls
 50 |     
 51 |     def get_latest_findings(self) -> List[str]:
 52 |         return self.history[-1].findings
 53 |     
 54 |     def get_latest_thought(self) -> str:
 55 |         return self.history[-1].thought
 56 |     
 57 |     def get_all_findings(self) -> List[str]:
 58 |         return [finding for iteration_data in self.history for finding in iteration_data.findings]
 59 | 
 60 |     def compile_conversation_history(self) -> str:
 61 |         """Compile the conversation history into a string."""
 62 |         conversation = ""
 63 |         for iteration_num, iteration_data in enumerate(self.history):
 64 |             conversation += f"[ITERATION {iteration_num + 1}]\n\n"
 65 |             if iteration_data.thought:
 66 |                 conversation += f"{self.get_thought_string(iteration_num)}\n\n"
 67 |             if iteration_data.gap:
 68 |                 conversation += f"{self.get_task_string(iteration_num)}\n\n"
 69 |             if iteration_data.tool_calls:
 70 |                 conversation += f"{self.get_action_string(iteration_num)}\n\n"
 71 |             if iteration_data.findings:
 72 |                 conversation += f"{self.get_findings_string(iteration_num)}\n\n"
 73 | 
 74 |         return conversation
 75 |     
 76 |     def get_task_string(self, iteration_num: int) -> str:
 77 |         """Get the task for the current iteration."""
 78 |         if self.history[iteration_num].gap:
 79 |             return f"<task>\nAddress this knowledge gap: {self.history[iteration_num].gap}\n</task>"
 80 |         return ""
 81 |     
 82 |     def get_action_string(self, iteration_num: int) -> str:
 83 |         """Get the action for the current iteration."""
 84 |         if self.history[iteration_num].tool_calls:
 85 |             joined_calls = '\n'.join(self.history[iteration_num].tool_calls)
 86 |             return (
 87 |                 "<action>\nCalling the following tools to address the knowledge gap:\n"
 88 |                 f"{joined_calls}\n</action>"
 89 |             )
 90 |         return ""
 91 |         
 92 |     def get_findings_string(self, iteration_num: int) -> str:
 93 |         """Get the findings for the current iteration."""
 94 |         if self.history[iteration_num].findings:
 95 |             joined_findings = '\n\n'.join(self.history[iteration_num].findings)
 96 |             return f"<findings>\n{joined_findings}\n</findings>"
 97 |         return ""
 98 |     
 99 |     def get_thought_string(self, iteration_num: int) -> str:
100 |         """Get the thought for the current iteration."""
101 |         if self.history[iteration_num].thought:
102 |             return f"<thought>\n{self.history[iteration_num].thought}\n</thought>"
103 |         return ""
104 |     
105 |     def latest_task_string(self) -> str:
106 |         """Get the latest task."""
107 |         return self.get_task_string(len(self.history) - 1)
108 |     
109 |     def latest_action_string(self) -> str:
110 |         """Get the latest action."""
111 |         return self.get_action_string(len(self.history) - 1)
112 |     
113 |     def latest_findings_string(self) -> str:
114 |         """Get the latest findings."""
115 |         return self.get_findings_string(len(self.history) - 1)
116 |     
117 |     def latest_thought_string(self) -> str:
118 |         """Get the latest thought."""
119 |         return self.get_thought_string(len(self.history) - 1)
120 |     
121 | 
122 | class IterativeResearcher:
123 |     """Manager for the iterative research workflow that conducts research on a topic or subtopic by running a continuous research loop."""
124 | 
125 |     def __init__(
126 |         self, 
127 |         max_iterations: int = 5,
128 |         max_time_minutes: int = 10,
129 |         verbose: bool = True,
130 |         tracing: bool = False,
131 |         config: Optional[LLMConfig] = None
132 |     ):
133 |         self.max_iterations: int = max_iterations
134 |         self.max_time_minutes: int = max_time_minutes
135 |         self.start_time: float = None
136 |         self.iteration: int = 0
137 |         self.conversation: Conversation = Conversation()
138 |         self.should_continue: bool = True
139 |         self.verbose: bool = verbose
140 |         self.tracing: bool = tracing
141 |         self.config: LLMConfig = create_default_config() if not config else config
142 |         self.knowledge_gap_agent = init_knowledge_gap_agent(self.config)
143 |         self.tool_selector_agent = init_tool_selector_agent(self.config)
144 |         self.thinking_agent = init_thinking_agent(self.config)
145 |         self.writer_agent = init_writer_agent(self.config)
146 |         self.tool_agents = init_tool_agents(self.config)
147 |         
148 |     async def run(
149 |             self, 
150 |             query: str,
151 |             output_length: str = "",  # A text description of the desired output length, can be left blank
152 |             output_instructions: str = "",  # Instructions for the final report (e.g. don't include any headings, just a couple of paragraphs of text)
153 |             background_context: str = "",
154 |         ) -> str:
155 |         """Run the deep research workflow for a given query."""
156 |         self.start_time = time.time()
157 | 
158 |         if self.tracing:
159 |             trace_id = gen_trace_id()
160 |             workflow_trace = trace("iterative_researcher", trace_id=trace_id)
161 |             print(f"View trace: https://platform.openai.com/traces/trace?trace_id={trace_id}")
162 |             workflow_trace.start(mark_as_current=True)
163 | 
164 |         self._log_message("=== Starting Iterative Research Workflow ===")
165 |         
166 |         # Iterative research loop
167 |         while self.should_continue and self._check_constraints():
168 |             self.iteration += 1
169 |             self._log_message(f"\n=== Starting Iteration {self.iteration} ===")
170 | 
171 |             # Set up blank IterationData for this iteration
172 |             self.conversation.add_iteration()
173 | 
174 |             # 1. Generate observations
175 |             observations: str = await self._generate_observations(query, background_context=background_context)
176 | 
177 |             # 2. Evaluate current gaps in the research
178 |             evaluation: KnowledgeGapOutput = await self._evaluate_gaps(query, background_context=background_context)
179 |             
180 |             # Check if we should continue or break the loop
181 |             if not evaluation.research_complete:
182 |                 next_gap = evaluation.outstanding_gaps[0]
183 | 
184 |                 # 3. Select agents to address knowledge gap
185 |                 selection_plan: AgentSelectionPlan = await self._select_agents(next_gap, query, background_context=background_context)
186 | 
187 |                 # 4. Run the selected agents to gather information
188 |                 results: Dict[str, ToolAgentOutput] = await self._execute_tools(selection_plan.tasks)
189 |             else:
190 |                 self.should_continue = False
191 |                 self._log_message("=== IterativeResearcher Marked As Complete - Finalizing Output ===")
192 |         
193 |         # Create final report
194 |         report = await self._create_final_report(query, length=output_length, instructions=output_instructions)
195 |         
196 |         elapsed_time = time.time() - self.start_time
197 |         self._log_message(f"IterativeResearcher completed in {int(elapsed_time // 60)} minutes and {int(elapsed_time % 60)} seconds after {self.iteration} iterations.")
198 |         
199 |         if self.tracing:
200 |             workflow_trace.finish(reset_current=True)
201 | 
202 |         return report
203 |     
204 |     def _check_constraints(self) -> bool:
205 |         """Check if we've exceeded our constraints (max iterations or time)."""
206 |         if self.iteration >= self.max_iterations:
207 |             self._log_message("\n=== Ending Research Loop ===")
208 |             self._log_message(f"Reached maximum iterations ({self.max_iterations})")
209 |             return False
210 |         
211 |         elapsed_minutes = (time.time() - self.start_time) / 60
212 |         if elapsed_minutes >= self.max_time_minutes:
213 |             self._log_message("\n=== Ending Research Loop ===")
214 |             self._log_message(f"Reached maximum time ({self.max_time_minutes} minutes)")
215 |             return False
216 |         
217 |         return True
218 |     
219 |     async def _evaluate_gaps(
220 |         self, 
221 |         query: str,
222 |         background_context: str = ""
223 |     ) -> KnowledgeGapOutput:
224 |         """Evaluate the current state of research and identify knowledge gaps."""
225 | 
226 |         background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""
227 | 
228 |         input_str = f"""
229 |         Current Iteration Number: {self.iteration}
230 |         Time Elapsed: {(time.time() - self.start_time) / 60:.2f} minutes of maximum {self.max_time_minutes} minutes
231 | 
232 |         ORIGINAL QUERY:
233 |         {query}
234 | 
235 |         {background}
236 | 
237 |         HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
238 |         {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}        
239 |         """
240 | 
241 |         result = await ResearchRunner.run(
242 |             self.knowledge_gap_agent,
243 |             input_str,
244 |         )
245 |         
246 |         evaluation = result.final_output_as(KnowledgeGapOutput)
247 | 
248 |         if not evaluation.research_complete:
249 |             next_gap = evaluation.outstanding_gaps[0]
250 |             self.conversation.set_latest_gap(next_gap)
251 |             self._log_message(self.conversation.latest_task_string())
252 |         
253 |         return evaluation
254 |     
255 |     async def _select_agents(
256 |         self, 
257 |         gap: str, 
258 |         query: str,
259 |         background_context: str = ""
260 |     ) -> AgentSelectionPlan:
261 |         """Select agents to address the identified knowledge gap."""
262 |         
263 |         background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""
264 | 
265 |         input_str = f"""
266 |         ORIGINAL QUERY:
267 |         {query}
268 | 
269 |         KNOWLEDGE GAP TO ADDRESS:
270 |         {gap}
271 | 
272 |         {background}
273 | 
274 |         HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
275 |         {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}
276 |         """
277 |         
278 |         result = await ResearchRunner.run(
279 |             self.tool_selector_agent,
280 |             input_str,
281 |         )
282 |         
283 |         selection_plan = result.final_output_as(AgentSelectionPlan)
284 | 
285 |         # Add the tool calls to the conversation
286 |         self.conversation.set_latest_tool_calls([
287 |             f"[Agent] {task.agent} [Query] {task.query} [Entity] {task.entity_website if task.entity_website else 'null'}" for task in selection_plan.tasks
288 |         ])
289 |         self._log_message(self.conversation.latest_action_string())
290 |         
291 |         return selection_plan
292 |     
293 |     async def _execute_tools(self, tasks: List[AgentTask]) -> Dict[str, ToolAgentOutput]:
294 |         """Execute the selected tools concurrently to gather information."""
295 |         with custom_span("Execute Tool Agents"):
296 |             # Create a task for each agent
297 |             async_tasks = []
298 |             for task in tasks:
299 |                 async_tasks.append(self._run_agent_task(task))
300 |             
301 |             # Run all tasks concurrently
302 |             num_completed = 0
303 |             results = {}
304 |             for future in asyncio.as_completed(async_tasks):
305 |                 gap, agent_name, result = await future
306 |                 results[f"{agent_name}_{gap}"] = result
307 |                 num_completed += 1
308 |                 self._log_message(f"<processing>\nTool execution progress: {num_completed}/{len(async_tasks)}\n</processing>")
309 | 
310 |             # Add findings from the tool outputs to the conversation
311 |             findings = []
312 |             for tool_output in results.values():
313 |                 findings.append(tool_output.output)
314 |             self.conversation.set_latest_findings(findings)
315 | 
316 |             return results
317 |     
318 |     async def _run_agent_task(self, task: AgentTask) -> tuple[str, str, ToolAgentOutput]:
319 |         """Run a single agent task and return the result."""
320 |         try:
321 |             agent_name = task.agent
322 |             agent = self.tool_agents.get(agent_name)
323 |             if agent:
324 |                 result = await ResearchRunner.run(
325 |                     agent,
326 |                     task.model_dump_json(),
327 |                 )
328 |                 # Extract ToolAgentOutput from RunResult
329 |                 output = result.final_output_as(ToolAgentOutput)
330 |             else:
331 |                 output = ToolAgentOutput(
332 |                     output=f"No implementation found for agent {agent_name}",
333 |                     sources=[]
334 |                 )
335 |             
336 |             return task.gap, agent_name, output
337 |         except Exception as e:
338 |             error_output = ToolAgentOutput(
339 |                 output=f"Error executing {task.agent} for gap '{task.gap}': {str(e)}",
340 |                 sources=[]
341 |             )
342 |             return task.gap, task.agent, error_output
343 |         
344 |     async def _generate_observations(self, query: str, background_context: str = "") -> str:
345 |         """Generate observations from the current state of the research."""
346 |         
347 |         background = f"BACKGROUND CONTEXT:\n{background_context}" if background_context else ""
348 | 
349 |         input_str = f"""
350 |         You are starting iteration {self.iteration} of your research process.
351 | 
352 |         ORIGINAL QUERY:
353 |         {query}
354 | 
355 |         {background}
356 | 
357 |         HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
358 |         {self.conversation.compile_conversation_history() or "No previous actions, findings or thoughts available."}
359 |         """
360 |         result = await ResearchRunner.run(
361 |             self.thinking_agent,
362 |             input_str,
363 |         )
364 | 
365 |         # Add the observations to the conversation
366 |         observations = result.final_output
367 |         self.conversation.set_latest_thought(observations)
368 |         self._log_message(self.conversation.latest_thought_string())
369 |         return observations
370 | 
371 |     async def _create_final_report(
372 |         self, 
373 |         query: str,
374 |         length: str = "",
375 |         instructions: str = ""
376 |         ) -> str:
377 |         """Create the final response from the completed draft."""
378 |         self._log_message("=== Drafting Final Response ===")
379 | 
380 |         length_str = f"* The full response should be approximately {length}.\n" if length else ""
381 |         instructions_str = f"* {instructions}" if instructions else ""
382 |         guidelines_str = ("\n\nGUIDELINES:\n" + length_str + instructions_str).strip('\n') if length or instructions else ""
383 | 
384 |         all_findings = '\n\n'.join(self.conversation.get_all_findings()) or "No findings available yet."
385 | 
386 |         input_str = f"""
387 |         Provide a response based on the query and findings below with as much detail as possible. {guidelines_str}
388 | 
389 |         QUERY: {query}
390 | 
391 |         FINDINGS:
392 |         {all_findings}
393 |         """
394 | 
395 |         result = await ResearchRunner.run(
396 |             self.writer_agent,
397 |             input_str,
398 |         )
399 |         
400 |         self._log_message("Final response from IterativeResearcher created successfully")
401 |         
402 |         return result.final_output
403 |     
404 |     def _log_message(self, message: str) -> None:
405 |         """Log a message if verbose is True"""
406 |         if self.verbose:
407 |             print(message)


--------------------------------------------------------------------------------
/deep_researcher/llm_config.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | 
  3 | from agents import (
  4 |     OpenAIChatCompletionsModel,
  5 |     OpenAIResponsesModel,
  6 |     set_tracing_disabled,
  7 |     set_tracing_export_api_key,
  8 | )
  9 | from dotenv import load_dotenv
 10 | from openai import AsyncAzureOpenAI, AsyncOpenAI
 11 | 
 12 | from .utils.os import get_env_with_prefix
 13 | 
 14 | load_dotenv(override=True)
 15 | 
 16 | OPENAI_API_KEY = get_env_with_prefix("OPENAI_API_KEY")
 17 | DEEPSEEK_API_KEY = get_env_with_prefix("DEEPSEEK_API_KEY")
 18 | OPENROUTER_API_KEY = get_env_with_prefix("OPENROUTER_API_KEY")
 19 | GEMINI_API_KEY = get_env_with_prefix("GEMINI_API_KEY")
 20 | ANTHROPIC_API_KEY = get_env_with_prefix("ANTHROPIC_API_KEY")
 21 | PERPLEXITY_API_KEY = get_env_with_prefix("PERPLEXITY_API_KEY")
 22 | HUGGINGFACE_API_KEY = get_env_with_prefix("HUGGINGFACE_API_KEY")
 23 | LOCAL_MODEL_URL = get_env_with_prefix(
 24 |     "LOCAL_MODEL_URL"
 25 | )  # e.g. "http://localhost:11434/v1"
 26 | AZURE_OPENAI_ENDPOINT = get_env_with_prefix("AZURE_OPENAI_ENDPOINT")
 27 | AZURE_OPENAI_DEPLOYMENT = get_env_with_prefix("AZURE_OPENAI_DEPLOYMENT")
 28 | AZURE_OPENAI_API_KEY = get_env_with_prefix("AZURE_OPENAI_API_KEY")
 29 | AZURE_OPENAI_API_VERSION = get_env_with_prefix("AZURE_OPENAI_API_VERSION")
 30 | 
 31 | REASONING_MODEL_PROVIDER = get_env_with_prefix("REASONING_MODEL_PROVIDER", "openai")
 32 | REASONING_MODEL = get_env_with_prefix("REASONING_MODEL", "o3-mini")
 33 | MAIN_MODEL_PROVIDER = get_env_with_prefix("MAIN_MODEL_PROVIDER", "openai")
 34 | MAIN_MODEL = get_env_with_prefix("MAIN_MODEL", "gpt-4o")
 35 | FAST_MODEL_PROVIDER = get_env_with_prefix("FAST_MODEL_PROVIDER", "openai")
 36 | FAST_MODEL = get_env_with_prefix("FAST_MODEL", "gpt-4o-mini")
 37 | 
 38 | SEARCH_PROVIDER = get_env_with_prefix("SEARCH_PROVIDER", "serper")
 39 | SEARCHXNG_HOST = get_env_with_prefix("SEARCHXNG_HOST")
 40 | 
 41 | supported_providers = [
 42 |     "openai",
 43 |     "deepseek",
 44 |     "openrouter",
 45 |     "gemini",
 46 |     "anthropic",
 47 |     "perplexity",
 48 |     "huggingface",
 49 |     "local",
 50 |     "azureopenai",
 51 | ]
 52 | 
 53 | provider_mapping = {
 54 |     "openai": {
 55 |         "client": AsyncOpenAI,
 56 |         "model": OpenAIResponsesModel,
 57 |         "base_url": None,
 58 |         "api_key": OPENAI_API_KEY,
 59 |     },
 60 |     "deepseek": {
 61 |         "client": AsyncOpenAI,
 62 |         "model": OpenAIChatCompletionsModel,
 63 |         "base_url": "https://api.deepseek.com/v1",
 64 |         "api_key": DEEPSEEK_API_KEY,
 65 |     },
 66 |     "openrouter": {
 67 |         "client": AsyncOpenAI,
 68 |         "model": OpenAIChatCompletionsModel,
 69 |         "base_url": "https://openrouter.ai/api/v1",
 70 |         "api_key": OPENROUTER_API_KEY,
 71 |     },
 72 |     "gemini": {
 73 |         "client": AsyncOpenAI,
 74 |         "model": OpenAIChatCompletionsModel,
 75 |         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
 76 |         "api_key": GEMINI_API_KEY,
 77 |     },
 78 |     "anthropic": {
 79 |         "client": AsyncOpenAI,
 80 |         "model": OpenAIChatCompletionsModel,
 81 |         "base_url": "https://api.anthropic.com/v1/",
 82 |         "api_key": ANTHROPIC_API_KEY,
 83 |     },
 84 |     "perplexity": {
 85 |         "client": AsyncOpenAI,
 86 |         "model": OpenAIChatCompletionsModel,
 87 |         "base_url": "https://api.perplexity.ai/chat/completions",
 88 |         "api_key": PERPLEXITY_API_KEY,
 89 |     },
 90 |     "huggingface": {
 91 |         "client": AsyncOpenAI,
 92 |         "model": OpenAIChatCompletionsModel,
 93 |         "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
 94 |         "api_key": HUGGINGFACE_API_KEY,
 95 |     },
 96 |     "local": {
 97 |         "client": AsyncOpenAI,
 98 |         "model": OpenAIChatCompletionsModel,
 99 |         "base_url": LOCAL_MODEL_URL,
100 |         "api_key": "ollama",  # Required by OpenAI client but not used
101 |     },
102 |     "azureopenai": {
103 |         "client": AsyncAzureOpenAI,
104 |         "model": OpenAIChatCompletionsModel,
105 |         "api_key": AZURE_OPENAI_API_KEY,
106 |         "azure_endpoint": AZURE_OPENAI_ENDPOINT,
107 |         "azure_deployment": AZURE_OPENAI_DEPLOYMENT,
108 |         "api_version": AZURE_OPENAI_API_VERSION,
109 |     },
110 | }
111 | 
112 | if OPENAI_API_KEY:
113 |     set_tracing_export_api_key(OPENAI_API_KEY)
114 | else:
115 |     # If no OpenAI API key is provided, disable tracing
116 |     set_tracing_disabled(True)
117 | 
118 | supported_search_providers = ["serper", "searchxng", "openai"]
119 | 
120 | 
121 | class LLMConfig:
122 | 
123 |     def __init__(
124 |         self,
125 |         search_provider: str,
126 |         reasoning_model_provider: str,
127 |         reasoning_model: str,
128 |         main_model_provider: str,
129 |         main_model: str,
130 |         fast_model_provider: str,
131 |         fast_model: str,
132 |     ):
133 |         if search_provider not in supported_search_providers:
134 |             raise ValueError(f"Invalid search provider: {search_provider}")
135 | 
136 |         self.search_provider = search_provider
137 | 
138 |         if reasoning_model_provider not in supported_providers:
139 |             raise ValueError(f"Invalid model provider: {reasoning_model_provider}")
140 |         if main_model_provider not in supported_providers:
141 |             raise ValueError(f"Invalid model provider: {main_model_provider}")
142 |         if fast_model_provider not in supported_providers:
143 |             raise ValueError(f"Invalid model provider: {fast_model_provider}")
144 | 
145 |         # Helper to init any provider model
146 |         def _init_model(provider_key: str, model_name: str):
147 |             m = provider_mapping[provider_key]
148 |             client_cls = m["client"]
149 |             kwargs = {k: v for k, v in m.items() if k not in ("model", "client")}
150 |             client = client_cls(**kwargs)
151 |             return m["model"](model=model_name, openai_client=client)
152 | 
153 |         self.reasoning_model = _init_model(reasoning_model_provider, reasoning_model)
154 |         self.main_model = _init_model(main_model_provider, main_model)
155 |         self.fast_model = _init_model(fast_model_provider, fast_model)
156 | 
157 | 
158 | def create_default_config() -> LLMConfig:
159 |     return LLMConfig(
160 |         search_provider=SEARCH_PROVIDER,
161 |         reasoning_model_provider=REASONING_MODEL_PROVIDER,
162 |         reasoning_model=REASONING_MODEL,
163 |         main_model_provider=MAIN_MODEL_PROVIDER,
164 |         main_model=MAIN_MODEL,
165 |         fast_model_provider=FAST_MODEL_PROVIDER,
166 |         fast_model=FAST_MODEL,
167 |     )
168 | 
169 | 
170 | def get_base_url(model: Union[OpenAIChatCompletionsModel, OpenAIResponsesModel]) -> str:
171 |     """Utility function to get the base URL for a given model"""
172 |     return str(model._client._base_url)
173 | 
174 | 
175 | def model_supports_structured_output(
176 |     model: Union[OpenAIChatCompletionsModel, OpenAIResponsesModel],
177 | ) -> bool:
178 |     """Utility function to check if a model supports structured output"""
179 |     structured_output_providers = ["openai.com", "anthropic.com"]
180 |     return any(
181 |         provider in get_base_url(model) for provider in structured_output_providers
182 |     )
183 | 


--------------------------------------------------------------------------------
/deep_researcher/main.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import argparse
 3 | from .iterative_research import IterativeResearcher
 4 | from .deep_research import DeepResearcher
 5 | from typing import Literal
 6 | from dotenv import load_dotenv
 7 | 
 8 | load_dotenv(override=True)
 9 | 
10 | 
11 | async def main() -> None:
12 |     parser = argparse.ArgumentParser(description="Deep Research Assistant")
13 |     parser.add_argument("--query", type=str, help="Research query")
14 |     parser.add_argument("--model", type=str, choices=["deep", "simple"], 
15 |                         help="Mode of research (deep or simple)", default="deep")
16 |     parser.add_argument("--max-iterations", type=int, default=5,
17 |                        help="Maximum number of iterations for deep research")
18 |     parser.add_argument("--max-time", type=int, default=10,
19 |                        help="Maximum time in minutes for deep research")
20 |     parser.add_argument("--output-length", type=str, default="5 pages",
21 |                        help="Desired output length for the report")
22 |     parser.add_argument("--output-instructions", type=str, default="",
23 |                        help="Additional instructions for the report")
24 |     parser.add_argument("--verbose", action="store_true",
25 |                        help="Print status updates to the console")
26 |     parser.add_argument("--tracing", action="store_true",
27 |                        help="Enable tracing for the research (only valid for OpenAI models)")
28 |     
29 |     args = parser.parse_args()
30 |     
31 |     # If no query is provided via command line, prompt the user
32 |     query = args.query if args.query else input("What would you like to research? ")
33 |     
34 |     print(f"Starting deep research on: {query}")
35 |     print(f"Max iterations: {args.max_iterations}, Max time: {args.max_time} minutes")
36 |     
37 |     if args.model == "deep":
38 |         manager = DeepResearcher(
39 |             max_iterations=args.max_iterations,
40 |             max_time_minutes=args.max_time,
41 |             verbose=args.verbose,
42 |             tracing=args.tracing
43 |         )
44 |         report = await manager.run(query)
45 |     else:
46 |         manager = IterativeResearcher(
47 |             max_iterations=args.max_iterations,
48 |             max_time_minutes=args.max_time,
49 |             verbose=args.verbose,
50 |             tracing=args.tracing
51 |         )
52 |         report = await manager.run(
53 |             query, 
54 |             output_length=args.output_length, 
55 |             output_instructions=args.output_instructions
56 |         )
57 | 
58 |     print("\n=== Final Report ===")
59 |     print(report)
60 | 
61 | # Command line entry point
62 | def cli_entry():
63 |     """Entry point for the command-line interface."""
64 |     asyncio.run(main())
65 | 
66 | if __name__ == "__main__":
67 |     cli_entry()
68 | 


--------------------------------------------------------------------------------
/deep_researcher/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .web_search import create_web_search_tool
2 | from .crawl_website import crawl_website
3 | 


--------------------------------------------------------------------------------
/deep_researcher/tools/crawl_website.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Set, Union
  2 | from urllib.parse import urlparse, urljoin
  3 | from bs4 import BeautifulSoup
  4 | import aiohttp
  5 | from .web_search import scrape_urls, ssl_context, ScrapeResult, WebpageSnippet
  6 | from agents import function_tool
  7 | 
  8 | 
  9 | @function_tool
 10 | async def crawl_website(starting_url: str) -> Union[List[ScrapeResult], str]:
 11 |     """Crawls the pages of a website starting with the starting_url and then descending into the pages linked from there.
 12 |     Prioritizes links found in headers/navigation, then body links, then subsequent pages.
 13 |     
 14 |     Args:
 15 |         starting_url: Starting URL to scrape
 16 |         
 17 |     Returns:
 18 |         List of ScrapeResult objects which have the following fields:
 19 |             - url: The URL of the web page
 20 |             - title: The title of the web page
 21 |             - description: The description of the web page
 22 |             - text: The text content of the web page
 23 |     """
 24 |     if not starting_url:
 25 |         return "Empty URL provided"
 26 | 
 27 |     # Ensure URL has a protocol
 28 |     if not starting_url.startswith(('http://', 'https://')):
 29 |         starting_url = 'http://' + starting_url
 30 | 
 31 |     max_pages = 10
 32 |     base_domain = urlparse(starting_url).netloc
 33 |     
 34 |     async def extract_links(html: str, current_url: str) -> tuple[List[str], List[str]]:
 35 |         """Extract prioritized links from HTML content"""
 36 |         soup = BeautifulSoup(html, 'html.parser')
 37 |         nav_links = set()
 38 |         body_links = set()
 39 |         
 40 |         # Find navigation/header links
 41 |         for nav_element in soup.find_all(['nav', 'header']):
 42 |             for a in nav_element.find_all('a', href=True):
 43 |                 link = urljoin(current_url, a['href'])
 44 |                 if urlparse(link).netloc == base_domain:
 45 |                     nav_links.add(link)
 46 |         
 47 |         # Find remaining body links
 48 |         for a in soup.find_all('a', href=True):
 49 |             link = urljoin(current_url, a['href'])
 50 |             if urlparse(link).netloc == base_domain and link not in nav_links:
 51 |                 body_links.add(link)
 52 |                 
 53 |         return list(nav_links), list(body_links)
 54 | 
 55 |     async def fetch_page(url: str) -> str:
 56 |         """Fetch HTML content from a URL"""
 57 |         connector = aiohttp.TCPConnector(ssl=ssl_context)
 58 |         async with aiohttp.ClientSession(connector=connector) as session:
 59 |             try:
 60 |                 async with session.get(url, timeout=30) as response:
 61 |                     if response.status == 200:
 62 |                         return await response.text()
 63 |             except Exception as e:
 64 |                 print(f"Error fetching {url}: {str(e)}")
 65 |                 return "Error fetching page"
 66 | 
 67 |     # Initialize with starting URL
 68 |     queue: List[str] = [starting_url]
 69 |     next_level_queue: List[str] = []
 70 |     all_pages_to_scrape: Set[str] = set([starting_url])
 71 |     
 72 |     # Breadth-first crawl
 73 |     while queue and len(all_pages_to_scrape) < max_pages:
 74 |         current_url = queue.pop(0)
 75 |         
 76 |         # Fetch and process the page
 77 |         html_content = await fetch_page(current_url)
 78 |         if html_content:
 79 |             nav_links, body_links = await extract_links(html_content, current_url)
 80 |             
 81 |             # Add unvisited nav links to current queue (higher priority)
 82 |             remaining_slots = max_pages - len(all_pages_to_scrape)
 83 |             for link in nav_links:
 84 |                 link = link.rstrip('/')
 85 |                 if link not in all_pages_to_scrape and remaining_slots > 0:
 86 |                     queue.append(link)
 87 |                     all_pages_to_scrape.add(link)
 88 |                     remaining_slots -= 1
 89 |             
 90 |             # Add unvisited body links to next level queue (lower priority)
 91 |             for link in body_links:
 92 |                 link = link.rstrip('/')
 93 |                 if link not in all_pages_to_scrape and remaining_slots > 0:
 94 |                     next_level_queue.append(link)
 95 |                     all_pages_to_scrape.add(link)
 96 |                     remaining_slots -= 1
 97 |         
 98 |         # If current queue is empty, add next level links
 99 |         if not queue:
100 |             queue = next_level_queue
101 |             next_level_queue = []
102 |     
103 |     # Convert set to list for final processing
104 |     pages_to_scrape = list(all_pages_to_scrape)[:max_pages]
105 |     pages_to_scrape = [WebpageSnippet(url=page, title="", description="") for page in pages_to_scrape]
106 |     
107 |     # Use scrape_urls to get the content for all discovered pages
108 |     result = await scrape_urls(pages_to_scrape)
109 |     return result


--------------------------------------------------------------------------------
/deep_researcher/tools/web_search.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | import os
  4 | import ssl
  5 | from typing import List, Optional, Union
  6 | 
  7 | import aiohttp
  8 | from agents import function_tool
  9 | from bs4 import BeautifulSoup
 10 | from dotenv import load_dotenv
 11 | from pydantic import BaseModel, Field
 12 | 
 13 | from ..agents.baseclass import ResearchAgent, ResearchRunner
 14 | from ..agents.utils.parse_output import create_type_parser
 15 | from ..llm_config import LLMConfig, model_supports_structured_output
 16 | 
 17 | load_dotenv()
 18 | CONTENT_LENGTH_LIMIT = 10000  # Trim scraped content to this length to avoid large context / token limit issues
 19 | 
 20 | # ------- DEFINE TYPES -------
 21 | 
 22 | 
 23 | class ScrapeResult(BaseModel):
 24 |     url: str = Field(description="The URL of the webpage")
 25 |     text: str = Field(description="The full text content of the webpage")
 26 |     title: str = Field(description="The title of the webpage")
 27 |     description: str = Field(description="A short description of the webpage")
 28 | 
 29 | 
 30 | class WebpageSnippet(BaseModel):
 31 |     url: str = Field(description="The URL of the webpage")
 32 |     title: str = Field(description="The title of the webpage")
 33 |     description: Optional[str] = Field(description="A short description of the webpage")
 34 | 
 35 | 
 36 | class SearchResults(BaseModel):
 37 |     results_list: List[WebpageSnippet]
 38 | 
 39 | 
 40 | # ------- DEFINE TOOL -------
 41 | 
 42 | 
 43 | def create_web_search_tool(config: LLMConfig) -> function_tool:
 44 |     filter_agent = init_filter_agent(config)
 45 | 
 46 |     if config.search_provider == "serper":
 47 |         search_client = SerperClient(filter_agent)
 48 |     elif config.search_provider == "searchxng":
 49 |         search_client = SearchXNGClient(filter_agent)
 50 |     else:
 51 |         raise ValueError(f"Invalid search provider: {config.search_provider}")
 52 | 
 53 |     @function_tool
 54 |     async def web_search(query: str) -> Union[List[ScrapeResult], str]:
 55 |         """Perform a web search for a given query and get back the URLs along with their titles, descriptions and text contents.
 56 | 
 57 |         Args:
 58 |             query: The search query
 59 | 
 60 |         Returns:
 61 |             List of ScrapeResult objects which have the following fields:
 62 |                 - url: The URL of the search result
 63 |                 - title: The title of the search result
 64 |                 - description: The description of the search result
 65 |                 - text: The full text content of the search result
 66 |         """
 67 |         try:
 68 |             search_results = await search_client.search(
 69 |                 query, filter_for_relevance=True, max_results=5
 70 |             )
 71 |             results = await scrape_urls(search_results)
 72 |             return results
 73 |         except Exception as e:
 74 |             # Return a user-friendly error message
 75 |             return f"Sorry, I encountered an error while searching: {str(e)}"
 76 | 
 77 |     return web_search
 78 | 
 79 | 
 80 | # ------- DEFINE AGENT FOR FILTERING SEARCH RESULTS BY RELEVANCE -------
 81 | 
 82 | FILTER_AGENT_INSTRUCTIONS = f"""
 83 | You are a search result filter. Your task is to analyze a list of SERP search results and determine which ones are relevant
 84 | to the original query based on the link, title and snippet. Return only the relevant results in the specified format. 
 85 | 
 86 | - Remove any results that refer to entities that have similar names to the queried entity, but are not the same.
 87 | - E.g. if the query asks about a company "Amce Inc, acme.com", remove results with "acmesolutions.com" or "acme.net" in the link.
 88 | 
 89 | Only output JSON. Follow the JSON schema below. Do not output anything else. I will be parsing this with Pydantic so output valid JSON only:
 90 | {SearchResults.model_json_schema()}
 91 | """
 92 | 
 93 | 
 94 | def init_filter_agent(config: LLMConfig) -> ResearchAgent:
 95 |     selected_model = config.reasoning_model
 96 | 
 97 |     return ResearchAgent(
 98 |         name="SearchFilterAgent",
 99 |         instructions=FILTER_AGENT_INSTRUCTIONS,
100 |         model=selected_model,
101 |         output_type=(
102 |             SearchResults if model_supports_structured_output(selected_model) else None
103 |         ),
104 |         output_parser=(
105 |             create_type_parser(SearchResults)
106 |             if not model_supports_structured_output(selected_model)
107 |             else None
108 |         ),
109 |     )
110 | 
111 | 
112 | # ------- DEFINE UNDERLYING TOOL LOGIC -------
113 | 
114 | # Create a shared connector
115 | ssl_context = ssl.create_default_context()
116 | ssl_context.check_hostname = False
117 | ssl_context.verify_mode = ssl.CERT_NONE
118 | ssl_context.set_ciphers(
119 |     "DEFAULT:@SECLEVEL=1"
120 | )  # Add this line to allow older cipher suites
121 | 
122 | 
123 | class SerperClient:
124 |     """A client for the Serper API to perform Google searches."""
125 | 
126 |     def __init__(self, filter_agent: ResearchAgent, api_key: str = None):
127 |         self.filter_agent = filter_agent
128 |         self.api_key = api_key or os.getenv("SERPER_API_KEY")
129 |         if not self.api_key:
130 |             raise ValueError(
131 |                 "No API key provided. Set SERPER_API_KEY environment variable."
132 |             )
133 | 
134 |         self.url = "https://google.serper.dev/search"
135 |         self.headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"}
136 | 
137 |     async def search(
138 |         self, query: str, filter_for_relevance: bool = True, max_results: int = 5
139 |     ) -> List[WebpageSnippet]:
140 |         """Perform a Google search using Serper API and fetch basic details for top results.
141 | 
142 |         Args:
143 |             query: The search query
144 |             num_results: Maximum number of results to return (max 10)
145 | 
146 |         Returns:
147 |             Dictionary with search results
148 |         """
149 |         connector = aiohttp.TCPConnector(ssl=ssl_context)
150 |         async with aiohttp.ClientSession(connector=connector) as session:
151 |             async with session.post(
152 |                 self.url, headers=self.headers, json={"q": query, "autocorrect": False}
153 |             ) as response:
154 |                 response.raise_for_status()
155 |                 results = await response.json()
156 |                 results_list = [
157 |                     WebpageSnippet(
158 |                         url=result.get("link", ""),
159 |                         title=result.get("title", ""),
160 |                         description=result.get("snippet", ""),
161 |                     )
162 |                     for result in results.get("organic", [])
163 |                 ]
164 | 
165 |         if not results_list:
166 |             return []
167 | 
168 |         if not filter_for_relevance:
169 |             return results_list[:max_results]
170 | 
171 |         return await self._filter_results(results_list, query, max_results=max_results)
172 | 
173 |     async def _filter_results(
174 |         self, results: List[WebpageSnippet], query: str, max_results: int = 5
175 |     ) -> List[WebpageSnippet]:
176 |         serialized_results = [
177 |             result.model_dump() if isinstance(result, WebpageSnippet) else result
178 |             for result in results
179 |         ]
180 | 
181 |         user_prompt = f"""
182 |         Original search query: {query}
183 |         
184 |         Search results to analyze:
185 |         {json.dumps(serialized_results, indent=2)}
186 |         
187 |         Return {max_results} search results or less.
188 |         """
189 | 
190 |         try:
191 |             result = await ResearchRunner.run(self.filter_agent, user_prompt)
192 |             output = result.final_output_as(SearchResults)
193 |             return output.results_list
194 |         except Exception as e:
195 |             print("Error filtering results:", str(e))
196 |             return results[:max_results]
197 | 
198 | 
199 | class SearchXNGClient:
200 |     """A client for the SearchXNG API to perform Google searches."""
201 | 
202 |     def __init__(self, filter_agent: ResearchAgent):
203 |         self.filter_agent = filter_agent
204 |         self.host = os.getenv("SEARCHXNG_HOST")
205 |         if not self.host.endswith("/search"):
206 |             self.host = (
207 |                 f"{self.host}/search"
208 |                 if not self.host.endswith("/")
209 |                 else f"{self.host}search"
210 |             )
211 | 
212 |     async def search(
213 |         self, query: str, filter_for_relevance: bool = True, max_results: int = 5
214 |     ) -> List[WebpageSnippet]:
215 |         """Perform a search using SearchXNG API."""
216 |         connector = aiohttp.TCPConnector(ssl=ssl_context)
217 |         async with aiohttp.ClientSession(connector=connector) as session:
218 |             params = {
219 |                 "q": query,
220 |                 "format": "json",
221 |             }
222 | 
223 |             async with session.get(self.host, params=params) as response:
224 |                 response.raise_for_status()
225 |                 results = await response.json()
226 | 
227 |                 results_list = [
228 |                     WebpageSnippet(
229 |                         url=result.get("url", ""),
230 |                         title=result.get("title", ""),
231 |                         description=result.get("content", ""),
232 |                     )
233 |                     for result in results.get("results", [])
234 |                 ]
235 | 
236 |         if not results_list:
237 |             return []
238 | 
239 |         if not filter_for_relevance:
240 |             return results_list[:max_results]
241 | 
242 |         return await self._filter_results(results_list, query, max_results=max_results)
243 | 
244 |     async def _filter_results(
245 |         self, results: List[WebpageSnippet], query: str, max_results: int = 5
246 |     ) -> List[WebpageSnippet]:
247 |         serialized_results = [
248 |             result.model_dump() if isinstance(result, WebpageSnippet) else result
249 |             for result in results
250 |         ]
251 | 
252 |         user_prompt = f"""
253 |         Original search query: {query}
254 |         
255 |         Search results to analyze:
256 |         {json.dumps(serialized_results, indent=2)}
257 |         
258 |         Return {max_results} search results or less.
259 |         """
260 | 
261 |         try:
262 |             result = await ResearchRunner.run(self.filter_agent, user_prompt)
263 |             output = result.final_output_as(SearchResults)
264 |             return output.results_list
265 |         except Exception as e:
266 |             print("Error filtering results:", str(e))
267 |             return results[:max_results]
268 | 
269 | 
270 | async def scrape_urls(items: List[WebpageSnippet]) -> List[ScrapeResult]:
271 |     """Fetch text content from provided URLs.
272 | 
273 |     Args:
274 |         items: List of SearchEngineResult items to extract content from
275 | 
276 |     Returns:
277 |         List of ScrapeResult objects which have the following fields:
278 |             - url: The URL of the search result
279 |             - title: The title of the search result
280 |             - description: The description of the search result
281 |             - text: The full text content of the search result
282 |     """
283 |     connector = aiohttp.TCPConnector(ssl=ssl_context)
284 |     async with aiohttp.ClientSession(connector=connector) as session:
285 |         # Create list of tasks for concurrent execution
286 |         tasks = []
287 |         for item in items:
288 |             if item.url:  # Skip empty URLs
289 |                 tasks.append(fetch_and_process_url(session, item))
290 | 
291 |         # Execute all tasks concurrently and gather results
292 |         results = await asyncio.gather(*tasks, return_exceptions=True)
293 | 
294 |         # Filter out errors and return successful results
295 |         return [r for r in results if isinstance(r, ScrapeResult)]
296 | 
297 | 
298 | async def fetch_and_process_url(
299 |     session: aiohttp.ClientSession, item: WebpageSnippet
300 | ) -> ScrapeResult:
301 |     """Helper function to fetch and process a single URL."""
302 | 
303 |     if not is_valid_url(item.url):
304 |         return ScrapeResult(
305 |             url=item.url,
306 |             title=item.title,
307 |             description=item.description,
308 |             text=f"Error fetching content: URL contains restricted file extension",
309 |         )
310 | 
311 |     try:
312 |         async with session.get(item.url, timeout=8) as response:
313 |             if response.status == 200:
314 |                 content = await response.text()
315 |                 # Run html_to_text in a thread pool to avoid blocking
316 |                 text_content = await asyncio.get_event_loop().run_in_executor(
317 |                     None, html_to_text, content
318 |                 )
319 |                 text_content = text_content[
320 |                     :CONTENT_LENGTH_LIMIT
321 |                 ]  # Trim content to avoid exceeding token limit
322 |                 return ScrapeResult(
323 |                     url=item.url,
324 |                     title=item.title,
325 |                     description=item.description,
326 |                     text=text_content,
327 |                 )
328 |             else:
329 |                 # Instead of raising, return a WebSearchResult with an error message
330 |                 return ScrapeResult(
331 |                     url=item.url,
332 |                     title=item.title,
333 |                     description=item.description,
334 |                     text=f"Error fetching content: HTTP {response.status}",
335 |                 )
336 |     except Exception as e:
337 |         # Instead of raising, return a WebSearchResult with an error message
338 |         return ScrapeResult(
339 |             url=item.url,
340 |             title=item.title,
341 |             description=item.description,
342 |             text=f"Error fetching content: {str(e)}",
343 |         )
344 | 
345 | 
346 | def html_to_text(html_content: str) -> str:
347 |     """
348 |     Strips out all of the unnecessary elements from the HTML context to prepare it for text extraction / LLM processing.
349 |     """
350 |     # Parse the HTML using lxml for speed
351 |     soup = BeautifulSoup(html_content, "lxml")
352 | 
353 |     # Extract text from relevant tags
354 |     tags_to_extract = ("h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "blockquote")
355 | 
356 |     # Use a generator expression for efficiency
357 |     extracted_text = "\n".join(
358 |         element.get_text(strip=True)
359 |         for element in soup.find_all(tags_to_extract)
360 |         if element.get_text(strip=True)
361 |     )
362 | 
363 |     return extracted_text
364 | 
365 | 
366 | def is_valid_url(url: str) -> bool:
367 |     """Check that a URL does not contain restricted file extensions."""
368 |     if any(
369 |         ext in url
370 |         for ext in [
371 |             ".pdf",
372 |             ".doc",
373 |             ".xls",
374 |             ".ppt",
375 |             ".zip",
376 |             ".rar",
377 |             ".7z",
378 |             ".txt",
379 |             ".js",
380 |             ".xml",
381 |             ".css",
382 |             ".png",
383 |             ".jpg",
384 |             ".jpeg",
385 |             ".gif",
386 |             ".ico",
387 |             ".svg",
388 |             ".webp",
389 |             ".mp3",
390 |             ".mp4",
391 |             ".avi",
392 |             ".mov",
393 |             ".wmv",
394 |             ".flv",
395 |             ".wma",
396 |             ".wav",
397 |             ".m4a",
398 |             ".m4v",
399 |             ".m4b",
400 |             ".m4p",
401 |             ".m4u",
402 |         ]
403 |     ):
404 |         return False
405 |     return True
406 | 


--------------------------------------------------------------------------------
/deep_researcher/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qx-labs/agents-deep-research/4e3ce35be10a59dfdbe4180ce60085a6e5ffb7ff/deep_researcher/utils/__init__.py


--------------------------------------------------------------------------------
/deep_researcher/utils/markdown.css:
--------------------------------------------------------------------------------
1 | body {
2 |     font-family: Arial, sans-serif;
3 |     font-size: 14px;
4 |     line-height: 1.8;
5 |     color: #000;
6 | }


--------------------------------------------------------------------------------
/deep_researcher/utils/md_to_pdf.py:
--------------------------------------------------------------------------------
1 | import os
2 | from md2pdf import md2pdf
3 | 
4 | curdir = os.path.dirname(os.path.abspath(__file__))
5 | css_path = os.path.join(curdir, "markdown.css")
6 | 
7 | def md_to_pdf(md_text: str, pdf_file_path: str):
8 |     md2pdf(pdf_file_path, md_text, css_file_path=css_path)
9 | 


--------------------------------------------------------------------------------
/deep_researcher/utils/os.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Optional
 3 | 
 4 | def get_env_with_prefix(base_name: str, prefix: str = "DR_", default: Optional[str] = None) -> Optional[str]:
 5 |     """
 6 |     Retrieves an environment variable, checking for a prefixed version first.
 7 | 
 8 |     Args:
 9 |         base_name: The base name of the environment variable (e.g., "OPENAI_API_KEY").
10 |         prefix: The prefix to check for (e.g., "DR_"). Defaults to "DR_".
11 |         default: The default value to return if neither the prefixed nor the
12 |                  base variable is found.
13 | 
14 |     Returns:
15 |         The value of the environment variable, or the default value, or None.
16 |     """
17 |     prefixed_name = f"{prefix}{base_name}"
18 |     value = os.getenv(prefixed_name)
19 |     if value is not None:
20 |         return value
21 |     return os.getenv(base_name, default)
22 | 


--------------------------------------------------------------------------------
/examples/deep_example.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Example usage of the DeepResearcher to produce a report.
 3 | 
 4 | See deep_output.txt for the console output from running this script, and deep_output.pdf for the final report
 5 | """
 6 | 
 7 | import asyncio
 8 | import os
 9 | import sys
10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11 | from deep_researcher import DeepResearcher
12 | 
13 | manager = DeepResearcher(
14 |     max_iterations=3,
15 |     max_time_minutes=10,
16 |     verbose=True,
17 |     tracing=True
18 | )
19 | 
20 | query = "Write a report on Plato - who was he, what were his main works " \
21 |         "and what are the main philosophical ideas he's known for"
22 | 
23 | report = asyncio.run(
24 |     manager.run(
25 |         query
26 |     )
27 | )
28 | 
29 | print("\n=== Final Report ===")
30 | print(report)


--------------------------------------------------------------------------------
/examples/iterative_example.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Example usage of the IterativeResearcher to produce a report.
 3 | 
 4 | See iterative_output.txt for the console output from running this script, and iterative_output.pdf for the final report
 5 | """
 6 | 
 7 | import asyncio
 8 | import os
 9 | import sys
10 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11 | from deep_researcher import IterativeResearcher
12 | 
13 | manager = IterativeResearcher(
14 |     max_iterations=5,
15 |     max_time_minutes=10,
16 |     verbose=True,
17 |     tracing=True
18 | )
19 | 
20 | query = "Write a report on Plato - who was he, what were his main works " \
21 |         "and what are the main philosophical ideas he's known for"
22 | output_length = "5 pages"
23 | output_instructions = ""
24 | 
25 | report = asyncio.run(
26 |     manager.run(
27 |         query, 
28 |         output_length=output_length, 
29 |         output_instructions=output_instructions
30 |     )
31 | )
32 | 
33 | print("\n=== Final Report ===")
34 | print(report)


--------------------------------------------------------------------------------
/examples/report_plan_example.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from deep_researcher.agents.planner_agent import planner_agent, ReportPlan
 3 | from agents import gen_trace_id, trace
 4 | from deep_researcher import ResearchRunner
 5 | 
 6 | 
 7 | async def run_report_planner(query):
 8 |     trace_id = gen_trace_id()
 9 | 
10 |     with trace("Deep Research trace", trace_id=trace_id):
11 |         print(f"View trace: https://platform.openai.com/traces/{trace_id}")
12 |         result = await ResearchRunner.run(planner_agent, query)
13 |         plan = result.final_output_as(ReportPlan)
14 |         return plan
15 | 
16 | 
17 | user_query = "Provide a detailed overview of the company Quantera (quantera.io) from an investor's perspective"
18 | 
19 | plan = asyncio.run(run_report_planner(user_query))
20 | 
21 | print(f"BACKGROUND CONTEXT:\n{plan.background_context if plan.background_context else 'No background context'}")
22 | 
23 | print("\nREPORT OUTLINE:\n")
24 | for section in plan.report_outline:
25 |     print(f"Section: {section.title}")
26 |     print(f"Key question: {section.key_question}\n")
27 |     


--------------------------------------------------------------------------------
/examples/sample_output/labour_policies.md:
--------------------------------------------------------------------------------
 1 | User Query:
 2 | 
 3 | Provide a breakdown of the policies that the UK Labour government has implemented since they won the latest election
 4 |  
 5 | Parameters:
 6 | * Reasoning Model: o3-mini
 7 | * Main Model: gpt-4o-mini
 8 | * Fast Model: gpt-4o-mini
 9 | * Mode (Simple or Deep): Simple
10 | * Max Iterations: 4
11 | 
12 | Cost: $0.02
13 | 
14 | ---
15 | 
16 | # Breakdown of UK Labour Government Policies Post July 2024 Election
17 | 
18 | ## Introduction
19 | 
20 | The UK Labour Party, led by Keir Starmer, achieved a significant electoral victory on July 4, 2024, marking its return to power after a 14-year hiatus. This victory resulted in securing **412 seats** in the House of Commons, representing a **174-seat majority** and a decisive shift in the political landscape from the Conservative Party, which plummeted to **121 seats**. Despite a relatively modest national vote share of **34%**, this result underscores a complex voter dynamic and a significant challenge for the Labour government as it embarks on a wide-ranging policy agenda aimed at addressing national economic and social issues.
21 | 
22 | As Labour begins to implement its policies, the focus has been on immediate economic recovery, social welfare enhancement, and environmental sustainability. This document provides a detailed breakdown of key policies and initiatives undertaken by the Labour government since taking office, focusing on various areas including economic strategies, healthcare revisions, education reform, housing, and broader social policies.
23 | 
24 | ## 1. Economic Policies and Initiatives
25 | 
26 | ### 1.1 Overview of Economic Context
27 | Entering office in a challenging economic environment marked by stagnation since the 2008 financial crisis, Labour is focused on revitalizing growth and productivity. Acknowledging the inherited economic difficulties, the government promises to stimulate investment through strategic fiscal interventions.
28 | 
29 | ### 1.2 Tax and Spend Plans
30 | Labour's economic strategy includes a commitment to raise **£8 billion** through various tax reforms, particularly targeting wealthier individuals and large corporations.
31 | 
32 | - **VAT on Private Schools**: By introducing a **20% VAT on private school fees**, the government aims to generate an estimated **£1.6 billion annually** to fund **6,500 new teachers** in state schools. This policy is designed to address teacher shortages and improve educational outcomes, though it has faced criticism regarding its implementation and acceptance among parents ([Institute for Government](https://www.instituteforgovernment.org.uk/comment/2025-make-or-break-government)).
33 |   
34 | - **Windfall Tax on Energy Companies**: In light of rising energy prices, Labour plans to impose a windfall tax on major energy companies, with the intention of reallocating funds to support social services and green initiatives. This move aims not only to generate immediate revenue but also to address issues of energy poverty and sustainability.
35 | 
36 | ### 1.3 Infrastructure Investment
37 | A critical component of Labour’s economic agenda involves substantial investment in infrastructure.
38 | 
39 | - **National Infrastructure Plans**: Labour has set ambitious targets to build **1.5 million new homes** over five years, echoing building levels not seen since the 1960s. This initiative is aimed at combating the severe housing crisis, yet current construction rates highlight significant challenges to meeting these goals ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
40 | 
41 | - **Investment in Green Energy**: The government has proposed a **£23.7 billion** investment in renewable energy initiatives through a new energy company, **GB Energy**, aiming to create approximately **650,000 jobs** by 2030. This includes commitments to upgrade national energy infrastructure, significantly enhancing energy efficiency ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
42 | 
43 | ## 2. Healthcare Policies
44 | 
45 | ### 2.1 Commitment to NHS Reform
46 | Healthcare reform remains a top priority for the Labour government, particularly in addressing longstanding issues in the National Health Service (NHS).
47 | 
48 | - **Staffing Increase**: Labour pledges to recruit **8,500 new mental health staff** to bolster mental health services, alongside plans to provide an additional **40,000 GP appointments per week** to alleviate waiting times and enhance patient access. This commitment reflects a broader strategy to improve community health services ([Lexology](https://www.lexology.com/library/detail.aspx?g=978c7b0c-83bf-495f-a7f8-ac4614102e61)).
49 | 
50 | - **Investment in Diagnostic Facilities**: The government has committed to **doubling the number of cancer scanners** and has introduced a *Dentistry Rescue Plan*, aimed at increasing urgent dental appointments by **700,000**, addressing a significant gap in dental access ([Lexology](https://www.lexology.com/library/detail.aspx?g=978c7b0c-83bf-495f-a7f8-ac4614102e61)).
51 | 
52 | ## 3. Education Reforms
53 | 
54 | ### 3.1 Focus on Education Investment
55 | Education is a cornerstone of Labour’s policy vision, with a multifaceted approach aimed at improving outcomes and addressing systemic inequities.
56 | 
57 | - **Investment in Teacher Recruitment**: By reallocating funds raised from VAT on private school fees, the government intends to recruit additional teaching staff and improve educational infrastructure through direct investments in schools ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
58 | 
59 | - **Support for Mental Health in Schools**: With initiatives such as the establishment of **Young Futures hubs**, Labour emphasizes mental health parity with physical health. These hubs are designed to provide integrated care for children and young adults, addressing the rising rates of mental health issues exacerbated by the pandemic ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
60 | 
61 | ## 4. Housing Strategy
62 | 
63 | ### 4.1 Housing Development Initiatives
64 | Labour’s ambitious housing strategy seeks to alleviate the ongoing housing crisis exacerbated by years of underinvestment.
65 | 
66 | - **Home Building Targets**: Committed to building **1.5 million homes** in England during the next five years, Labour's housing policy also includes mitigating barriers to planning and increasing the supply of affordable housing for first-time buyers ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
67 | 
68 | - **Affordability Measures**: The government plans to enhance affordability by stimulating housing markets in underserved areas, thereby addressing both supply and demand challenges in the housing sector.
69 | 
70 | ## 5. Environmental Policies
71 | 
72 | ### 5.1 Green Policy Commitment
73 | Labour’s environmental initiative seeks to address climate change while fostering economic growth through green investments.
74 | 
75 | - **Green Prosperity Plan**: With a total commitment of **£23.7 billion** toward sustainable energy advancements, Labour aims to position the UK as a leader in renewable energy, countering previous government policies that extended fossil fuel usage timelines ([BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)).
76 | 
77 | - **Combating Climate Change**: The return of a ban on new petrol and diesel cars by **2030** reflects a robust commitment to reducing carbon emissions and encouraging a shift to sustainable transportation.
78 | 
79 | ## 6. Social Policies and Welfare Reforms
80 | 
81 | ### 6.1 Enhancing Social Safety Nets
82 | Labour is focused on bolstering social safety net policies to better support the vulnerable populations within the UK.
83 | 
84 | - **Employment Rights Bill**: Introduced on **October 10, 2024**, this bill aims to enhance job security and living standards. Key provisions include expanding workers' rights and improving conditions in low-paid sectors, reflecting Labour’s commitment to address inequalities in the workforce ([GOV.UK](https://www.gov.uk/government/publications/employment-rights-bill-factsheets)).
85 | 
86 | - **Childcare and Universal Support**: Proposals include expanding access to affordable childcare services, thereby facilitating greater participation of parents in the workforce, alongside universal welfare support adjustments aimed at reducing poverty ([GOV.UK](https://www.gov.uk/government/publications/employment-rights-bill-factsheets)).
87 | 
88 | ## Conclusion
89 | 
90 | Labour's early policies under the leadership of Keir Starmer reflect an ambitious agenda addressing immediate economic recovery, healthcare reform, and sustainable growth. While these policies are designed to tackle significant societal issues, ongoing challenges related to implementation delays, economic pressures, and public expectations underscore the complexity of governance in the current UK political landscape. As Labour navigates these multifaceted challenges, the effectiveness of its initiatives will be critical in shaping public perception and securing lasting change.
91 | 
92 | ### References
93 | 
94 | 1. [BBC](https://www.bbc.com/news/articles/cyxx1lq50nlo)
95 | 2. [Institute for Government](https://www.instituteforgovernment.org.uk/comment/2025-make-or-break-government)
96 | 3. [Lexology](https://www.lexology.com/library/detail.aspx?g=978c7b0c-83bf-495f-a7f8-ac4614102e61)
97 | 4. [GOV.UK](https://www.gov.uk/government/publications/employment-rights-bill-factsheets)


--------------------------------------------------------------------------------
/examples/sample_output/quantera_market_size.md:
--------------------------------------------------------------------------------
 1 | User Query:
 2 | 
 3 | What are the size, growth and dynamics of the market that the company Quantera (quantera.io) operates in?
 4 |  
 5 | Parameters:
 6 | * Reasoning Model: o3-mini
 7 | * Main Model: gpt-4o-mini
 8 | * Fast Model: gpt-4o-mini
 9 | * Mode (Simple or Deep): Simple
10 | * Max Iterations: 4
11 | 
12 | Cost: $0.02
13 | 
14 | ---
15 | 
16 | # Market Analysis of Quantera: Size, Growth, and Dynamics in Financial Technology
17 | 
18 | ## Overview of Quantera and Its Market Position
19 | 
20 | Quantera operates within the **financial technology (FinTech)** sector, specifically targeting private equity firms and investment companies. It leverages **artificial intelligence (AI)** and automation to enhance research and due diligence processes, allowing firms to efficiently gather, analyze, and report on data related to their investment pipelines. Launched by QX Labs, the platform excels in integrating various data sources—internal, subscription-based, and public—into a single, streamlined research system. Quantera's primary niche focuses on **AI-driven data automation for investment research and diligence**, enabling firms to produce customizable, automated reports that optimize their decision-making processes.
21 | 
22 | The company stands out in the financial landscape due to its emphasis on the integration of diverse data streams and the automation of research tasks, addressing the growing demand for efficiency and actionable insights in investment strategies.
23 | 
24 | ## Growth Characteristics of the FinTech and Investment Research Markets
25 | 
26 | ### Current Market Size and Future Projections
27 | 
28 | The global **FinTech market**, where Quantera operates, is undergoing rapid growth. It is currently valued at approximately **$209.7 billion** and is projected to reach **$258.83 billion by 2025**, representing a compound annual growth rate (CAGR) of **25.18%** from 2025 to 2029, with a projected market value of **$644.6 billion** by 2029 [1]. This explosive growth is driven by multiple factors, such as the increasing adoption of digital banking solutions and advancements in technology—including AI and automation—that enhance operational efficiencies and customer engagement.
29 | 
30 | In parallel, the **AI in asset management market** was valued at **$2.61 billion in 2022**, with a projected CAGR of **24.5%** from 2023 to 2030 [2]. This aligns with the trends seen across the FinTech sector, further indicating a substantial and expanding market for AI tools that facilitate investment decision-making.
31 | 
32 | ### Key Growth Drivers in FinTech and AI
33 | 
34 | 1. **Technological Advancements**: The proliferation of AI technologies, such as natural language processing and machine learning, has revolutionized investment strategies by providing deeper insights into vast datasets. Companies increasingly invest in these technologies to enhance operational efficiency and reduce risks [2].
35 | 
36 | 2. **Increased Competition**: The competitive landscape in investment management has surged, leading to greater demand for both technological solutions and data-driven strategies. Investment firms are now compelled to adopt advanced analytics to remain competitive and meet client demands effectively.
37 | 
38 | 3. **Changing Consumer Behavior**: As investors lean towards data-centric strategies, there is a rising expectation for transparency and quick turnarounds in reporting and analytics, which platforms like Quantera successfully address by streamlining research processes.
39 | 
40 | ### Market Dynamics and Trends
41 | 
42 | Several notable trends emerge as defining characteristics of the market that Quantera operates within:
43 | 
44 | - **Mergers and Acquisitions (M&A)**: Analysts predict a rise in M&A activities within the FinTech space as companies seek to consolidate resources, increase operational efficiencies, and scale their technological capabilities.
45 | 
46 | - **Generative AI Integration**: There is a sharp increase in interest in generative AI applications within investment research, indicating a shift towards utilizing AI not just for data analysis, but also for generating insights and automating client interactions [3].
47 | 
48 | - **Sustainability**: The trend of sustainable investing continues to grow, driven by consumer awareness and regulatory pushes towards environmentally-friendly practices. This shift necessitates the integration of sustainability metrics into investment analysis, creating additional layers of data for platforms like Quantera to process.
49 | 
50 | ### Investment Research Technology Trends
51 | 
52 | The technological landscape in investment research is being reshaped by several key developments:
53 | 
54 | 1. **Cloud Computing**: The transition to cloud-based solutions provides the necessary infrastructure for scalable operations, allowing firms to access data and analytics tools from anywhere—fundamental for modern investment strategies [4].
55 | 
56 | 2. **AI-Powered Analytics**: Firms are increasingly adopting AI-driven analytics to enhance accuracy and efficiency in predicting market trends, driving demand for platforms that can integrate these capabilities seamlessly.
57 | 
58 | 3. **Cybersecurity**: With heightened data sensitivity in financial institutions, cybersecurity remains a critical investment area. Companies recognize the need to bolster their defenses against potential breaches, driving demand for secure platforms like Quantera that handle sensitive financial data.
59 | 
60 | ### Addressable Market for AI-Driven Investment Tools
61 | 
62 | #### Total Addressable Market (TAM)
63 | 
64 | Estimates indicate that the **Total Addressable Market (TAM)** for AI in the investment sector is poised for substantial growth. The **Alternative Assets Industry** was reported to have approximately **$13.1 trillion** in Assets Under Management (AUM) as of mid-2023, with annual technological budget allocations of about **$13 billion** [5]. This suggests a significant opportunity exists for AI solutions that can improve operational efficiencies and enhance decision-making frameworks within investment firms.
65 | 
66 | #### Serviceable Addressable Market (SAM)
67 | 
68 | Early insights suggest that only **24%** of private equity firms currently utilize AI technologies effectively, highlighting a vast potential market for AI integration [6]. The high stakes and low margins in private equity necessitate swift, informed decision-making, indicating a critical need for the solutions that Quantera and others in the sector offer.
69 | 
70 | ### Serviceable Obtainable Market (SOM)
71 | 
72 | The **Serviceable Obtainable Market (SOM)** defines the realistic segment of the market that firms can capture. Successful companies in the investment technology space often target a SOM that ranges from **1%** to **5%** of the TAM for startups, while established firms may aim for **20%** to **30%** [7]. This strategic analysis helps firms like Quantera carve out realistic revenue targets and operational goals amidst growing competition.
73 | 
74 | ## Conclusion
75 | 
76 | The market in which Quantera operates is characterized by rapid technological advancement, a strong competitive landscape, and substantial growth projections. The FinTech sector is on a robust growth trajectory, driven by the increasing need for efficient investment processes and the evolution of consumer expectations around data access and analysis. As AI technologies continue to evolve, their integration into investment research will be vital, helping firms like Quantera harness growth opportunities in an increasingly data-driven world.
77 | 
78 | In summary, with a combination of technological prowess, strategic positioning, and a growing demand for automation and insights in investment research, Quantera is well-placed to leverage its capabilities as the market expands.
79 | 
80 | ## References
81 | 
82 | [1] https://explodingtopics.com/blog/fintech-market  
83 | [2] https://www.grandviewresearch.com/industry-analysis/artificial-intelligence-asset-management-market  
84 | [3] https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-top-trends-in-tech  
85 | [4] https://www.capgemini.com/insights/research-library/investment-trends-2025/  
86 | [5] https://medium.com/@guillemsague/what-is-the-tam-of-ai-in-the-investment-industry-43398d506ce7  
87 | [6] https://agencyanalytics.com/kpi-definitions/serviceable-obtainable-market  
88 | [7] https://agencyanalytics.com/kpi-definitions/serviceable-obtainable-market
89 | 


--------------------------------------------------------------------------------
/examples/sample_output/tesla.md:
--------------------------------------------------------------------------------
  1 | User Query:
  2 |  
  3 | Write a detailed report on Tesla, including:
  4 | * History of the company
  5 | * Key products and services
  6 | * Detailed insight into how their manufacturing process works
  7 | * Deep-dive on Tesla's relationship with China both for sales and manufacturing
  8 | * Detailed overview of their battery technology and key breakthroughs
  9 | * Financial performance
 10 |  
 11 | Parameters:
 12 | * Reasoning Model: o3-mini
 13 | * Main Model: gpt-4o-mini
 14 | * Fast Model: gpt-4o-mini
 15 | * Mode (Simple or Deep): Deep
 16 | * Max Iterations: 3
 17 | 
 18 | Cost: $0.18
 19 | 
 20 | ---
 21 | 
 22 | # Tesla: History, Products, Manufacturing, Market Dynamics, Battery Technology, and Financial Performance
 23 | 
 24 | ## Table of Contents
 25 | 
 26 | 1. History of Tesla, Inc.
 27 | 2. Tesla's Key Products and Services
 28 | 3. Manufacturing Process of Tesla Vehicles
 29 | 4. Tesla and China: A Complex Relationship
 30 | 5. Tesla's Battery Technology and Breakthroughs
 31 | 6. Tesla's Financial Performance
 32 | 
 33 | ## History of Tesla, Inc.
 34 | 
 35 | Tesla, Inc. has had a transformative impact on the automotive and energy sectors since its inception in 2003. Below is a comprehensive overview of the company's key milestones and challenges throughout its history.
 36 | 
 37 | ### Key Milestones
 38 | 
 39 | 1. **Founding and Early Days (2003-2008)**:
 40 |    - Tesla was founded by Martin Eberhard and Marc Tarpenning in July 2003. The company aimed to accelerate the world's transition to sustainable energy through electric vehicles (EVs).
 41 |    - In 2004, Elon Musk invested $7.5 million in Tesla during its Series A funding and became chairman, setting the stage for his integral role in the company's vision and strategy.
 42 | 
 43 | 2. **Introduction of the Tesla Roadster (2008)**:
 44 |    - In March 2008, Tesla launched the Roadster, its first production vehicle. The Roadster was groundbreaking for being one of the first highway-legal electric vehicles to use lithium-ion battery cells, achieving a range of over 200 miles per charge.
 45 |    - This vehicle demonstrated that electric cars could offer performance comparable to gasoline-powered sports cars and sold around 2,450 units.
 46 | 
 47 | 3. **IPO and Public Presence (2010)**:
 48 |    - Tesla went public in June 2010, raising $226 million. This IPO was crucial for the company as it provided the necessary capital to expand production and further invest in R&D.
 49 | 
 50 | 4. **Model S Launch (2012)**:
 51 |    - The Model S was launched in June 2012, receiving critical acclaim for its technology, safety features, and performance.
 52 |    - It was awarded multiple accolades, including the 2013 World Green Car of the Year and the highest safety rating from the National Highway Traffic Safety Administration (NHTSA).
 53 | 
 54 | 5. **Gigafactory Development (2014-2016)**:
 55 |    - In 2014, Tesla announced plans for its Gigafactory in Nevada, aimed at producing lithium-ion batteries at scale. This move was pivotal for reducing battery costs and securing a steady supply of batteries for its vehicles.
 56 |    - The Gigafactory officially opened in July 2016 and plays a critical role in Tesla's long-term sustainability strategy.
 57 | 
 58 | 6. **Model 3 Production (2017)**:
 59 |    - The Model 3, aimed at a broader consumer market, was unveiled in 2016 and began production in 2017. The vehicle garnered over 400,000 pre-orders, indicating strong consumer interest.
 60 |    - However, production challenges during the ramp-up phase highlighted the difficulties Tesla faced in achieving manufacturing efficiencies at scale.
 61 | 
 62 | 7. **Global Expansion and New Products (2018-2020)**:
 63 |    - Tesla expanded its market presence internationally, entering China with the Shanghai Gigafactory, which started production in late 2019.
 64 |    - The introduction of the Model Y in 2020 further diversified Tesla's offerings, competing in the growing SUV market.
 65 | 
 66 | 8. **Sustainability Initiatives (2020-Present)**:
 67 |    - Tesla's annual Impact Reports showcase its commitment to sustainability, detailing reductions in carbon emissions and local manufacturing efforts. For example, they detail how localized production minimizes logistics-related emissions [1](https://www.tesla.com/ns_videos/2020-tesla-impact-report.pdf).
 68 |    - In 2021, Tesla laid out its Master Plan Part 3, emphasizing its goal of advancing a sustainable energy economy while advocating for less material extraction and efficient investment strategies [2](https://www.tesla.com/ns_videos/Tesla-Master-Plan-Part-3.pdf).
 69 | 
 70 | ### Key Challenges
 71 | 
 72 | 1. **Production and Scalability Issues**:
 73 |    - Throughout its history, Tesla has faced significant challenges related to production scalability. The Model 3 production ramp-up exemplified this, with delays and quality issues plaguing initial outputs.
 74 |    - The company’s ambition to produce and deliver vehicles at scale led to what Musk termed "production hell," impacting its ability to meet consumer demand.
 75 | 
 76 | 2. **Financial Instability**:
 77 |    - Tesla has experienced numerous periods of financial instability, often operating at a loss during its early years. The company relied heavily on fundraising and capital markets for survival before achieving consistent profitability in 2020.
 78 | 
 79 | 3. **Leadership and Executive Turnover**:
 80 |    - Tesla has seen multiple changes in its executive team, which have created instability at times. While Elon Musk has been a constant and influential figure, other major positions, including CFO and CTO roles, have seen high turnover as the company navigates its growth challenges.
 81 | 
 82 | 4. **Market Competition**:
 83 |    - The automotive market is increasingly competitive, with established car manufacturers and new entrants bolstering their electric vehicle lineups, creating pressure on Tesla to innovate and differentiate itself continually.
 84 | 
 85 | 5. **Regulatory and Manufacturing Obstacles**:
 86 |    - Tesla has encountered various regulatory hurdles, including safety regulations and environmental standards, particularly in its international markets like China and Europe. Navigating these landscapes has required agility and foresight.
 87 | 
 88 | ### Conclusion
 89 | 
 90 | Tesla, Inc.'s journey over the past two decades encapsulates a blend of innovative breakthroughs and significant challenges. The company's milestones illustrate its pioneering role in the electric vehicle market and commitment to sustainable energy. However, overcoming production hurdles, maintaining financial stability, and navigating growing competition remain ongoing challenges that will define Tesla's future trajectory.
 91 | 
 92 | ## Tesla, Inc.: Main Products and Services and Their Contribution to Market Position
 93 | 
 94 | Tesla, Inc. has established itself as a leader in the electric vehicle (EV) and clean energy sectors with a robust portfolio of products and services. This comprehensive analysis highlights Tesla's key offerings, including electric vehicles, energy products, and advanced software services, and discusses how these elements contribute to its strong market position.
 95 | 
 96 | ### 1. Electric Vehicles (EVs)
 97 | 
 98 | Tesla's core product line comprises several electric vehicle models, each designed to cater to differing consumer needs:
 99 | 
100 | #### **Model S**  
101 | - **Range**: Approximately 410 miles (EPA estimated).  
102 | - **Acceleration**: 0-60 mph in 3.1 seconds.  
103 | - **Powertrain**: Dual Motor with a peak power of 670 hp.  
104 | - **Market Position**: The Model S is positioned as a luxury EV, combining high performance with cutting-edge technology, emphasizing Tesla's commitment to sustainability.
105 | 
106 | #### **Model 3**  
107 | - **Range**: Varies by configuration; designed for affordability while maintaining long-distance efficiency.  
108 | - **Charging Speed**: Can achieve up to 185 miles added in just 15 minutes with Supercharging.  
109 | - **Market Position**: Targeted towards the mass market, Model 3 represents Tesla's strategy to offer compelling electric alternatives to traditional gasoline vehicles, thereby increasing EV adoption.
110 | 
111 | #### **Model X**  
112 | - **Features**: Standard All-Wheel Drive (AWD) and extensive storage capacity, emphasizing luxury and utility.  
113 | - **Performance**: Known for the highest towing capacity of any electric SUV.  
114 | - **Market Position**: This model appeals to families seeking both functionality and advanced technology in their vehicles.
115 | 
116 | #### **Model Y**  
117 | - **Specifications**: Versatile design accommodating passengers and cargo.  
118 | - **Market Position**: The growing demand for crossovers makes Model Y integral to Tesla's expansion strategy, targeting a wide audience including families and young urban professionals.
119 | 
120 | #### **Impact on Market Position**  
121 | The release and success of these models have propelled Tesla to produce approximately **1.85 million vehicles** in 2023, reflecting a **35% increase** year-over-year (YoY) and a **38% rise** in deliveries to **1.81 million vehicles**. This growth is indicative of Tesla's strong market appeal and operational effectiveness in meeting rising consumer demand [3](https://www.tesla.com/models).
122 | 
123 | ### 2. Energy Products
124 | 
125 | In addition to electric vehicles, Tesla has made significant strides in the renewable energy sector with products that promote sustainability and energy independence:
126 | 
127 | #### **Powerwall**  
128 | - **Energy Capacity**: 13.5 kWh usable energy.  
129 | - **Power Output**: 5.8 kW continuous, supporting storage of solar energy for residential use.  
130 | - **Market Position**: The Powerwall integrates seamlessly with solar panels, enabling home users to store excess energy for later use. This versatility appeals to environmentally-conscious consumers looking to reduce energy costs.
131 | 
132 | #### **Powerpack**  
133 | - **Designed for**: Commercial applications, scalable to large businesses or utilities.  
134 | - **Energy Storage**: Each unit provides 210 kWh, suitable for extensive energy demands.  
135 | - **Market Position**: The Powerpack is essential for larger applications, reinforcing Tesla's presence in the commercial energy market.
136 | 
137 | #### **Solar Roof**  
138 | - **Functionality**: Combines roofing with solar energy generation.  
139 | - **Market Position**: Attractively marketed towards homeowners, it offers a dual solution of aesthetics and energy generation, further cementing Tesla's innovative image in the renewable energy sector.
140 | 
141 | #### **Impact on Market Position**  
142 | Tesla's energy products have gained traction among both residential and commercial consumers, contributing to a growing interest in renewable energy solutions. This pivot into energy complements Tesla's overall brand messaging of sustainability and technological advancement, enhancing its competitive edge in the market for energy storage and solar generation [4](https://www.tesla.com/energy).
143 | 
144 | ### 3. Software Services
145 | 
146 | An integral part of Tesla's innovation lies in its software offerings, specifically Autopilot and Full Self-Driving (FSD) features:
147 | 
148 | #### **Autopilot**  
149 | - **Functionality**: Provides automated steering, acceleration, and braking within lanes.  
150 | - **Market Position**: Positions Tesla as a frontrunner in automotive technology, emphasizing safety and advanced driving convenience.
151 | 
152 | #### **Full Self-Driving (FSD)**  
153 | - **Capabilities**: Includes features such as speed adaptations based on detected limits, although full autonomy remains under development.  
154 | - **Subscription Model**: Offers a flexible payment structure for users with vehicles equipped with the requisite hardware.  
155 | - **Market Position**: This innovative approach to vehicle software reaffirms Tesla's position in the tech-savvy automotive landscape, attracting consumers interested in future-focused driving technology [5](https://www.tesla.com/support/autopilot).
156 | 
157 | ### Conclusion
158 | 
159 | Tesla, Inc. has cultivated a commanding market presence through its diverse array of electric vehicles, energy products, and advanced software services. Each segment of Tesla’s offerings plays a pivotal role in fortifying its reputation for innovation, sustainability, and performance, thereby appealing strongly to a variety of consumers and solidifying its leadership in both the EV and energy markets.
160 | 
161 | By emphasizing technological advancements and sustainability, Tesla aligns itself with evolving consumer values, ensuring its market position remains robust amidst increasing competition in the automotive and renewable energy sectors.
162 | 
163 | ## Manufacturing Process of Tesla Vehicles
164 | 
165 | Tesla, Inc. stands as a trailblazer in both electric vehicle (EV) manufacturing and sustainable practices. The company's innovative manufacturing processes leverage advanced technologies and strategic methodologies that not only enhance operational efficiency but also minimize environmental impact. This detailed analysis presents insights into how Tesla achieves these objectives through its manufacturing systems, automation, robotics, and sustainable practices.
166 | 
167 | ### Overview of Tesla's Manufacturing Capabilities
168 | 
169 | #### Gigafactories: A Pillar of Production
170 | 
171 | Tesla's production capabilities are heavily anchored in its Gigafactories, notably:
172 | 
173 | - **Gigafactory Nevada**: This facility is pivotal in Tesla's battery production, being one of the highest-volume plants globally for electric motors, vehicle powertrains, and batteries. The scale and efficiency at Gigafactory Nevada enable Tesla to meet the rising demand for both electric vehicles and energy storage solutions [6](https://www.tesla.com/giga-nevada).
174 | 
175 | - **Battery Product Range**: Tesla develops various battery systems, such as the **Powerwall** for residential energy storage and the **Megapack** for utility-scale applications. These products enhance users' energy independence while contributing to grid reliability, supporting the shift toward renewable energy [7](https://www.tesla.com/megapack).
176 | 
177 | - **Development of Megafactory**: Tesla's upcoming Megafactory aims to be one of the world’s largest utility-scale battery manufacturing plants, capable of producing upwards of 10,000 Megapacks yearly, equating to around 40 GWh [8](https://www.tesla.com/megafactory). This expansion underlines Tesla's commitment to securing its place in the renewable energy sector.
178 | 
179 | ### Automation and Robotics in Manufacturing
180 | 
181 | Tesla utilizes cutting-edge automation and robotic technologies to streamline its production processes, achieving unprecedented efficiency:
182 | 
183 | 1. **AI-Driven Robotics**: The integration of artificial intelligence (AI) enhances both vehicle and robotics functionality. Tesla focuses heavily on using AI for vision and planning, which is crucial for the autonomous operations of robots within the manufacturing environment [9](https://www.tesla.com/AI).
184 | 
185 | 2. **Collaborative Robotic Systems**: Tesla employs a combination of robotics and human labor in its assembly lines. Robots perform repetitive, physically demanding tasks, enabling human workers to engage in more complex aspects of production [10](https://www.tesla.com/manufacturing). This hybrid workforce enhances productivity while minimizing workplace injuries.
186 | 
187 | 3. **Expanding Careers in Automation**: The company consistently hires skilled labor for roles that advance their automation capabilities, emphasizing a commitment to innovation within the manufacturing sector [11](https://www.tesla.com/careers/search/job/automation-engineer-manufacturing-engineering-225514).
188 | 
189 | ### Innovative Manufacturing Techniques
190 | 
191 | Tesla's manufacturing process is characterized by several innovative techniques designed to boost productivity and sustainability:
192 | 
193 | - **First Principles Approach**: This method prioritizes foundational principles over conventional practices when addressing manufacturing challenges. As a result, Tesla can develop uniquely tailored solutions that bolster efficiency (https://www.tesla.com/impact).
194 | 
195 | - **Sustainability-Led Design**: New Gigafactories, particularly in Berlin, incorporate advanced designs focusing on minimizing environmental impact while maximizing production efficiency. Such features promote sustainable practices across all facets of Tesla's operations [13](https://www.tesla.com/giga-berlin).
196 | 
197 | - **Continuous Engineering Development**: Tesla's engineering teams are committed to ongoing innovation in tooling and manufacturing processes, essential for adapting to future production needs while maintaining quality standards [14](https://www.tesla.com/engineering).
198 | 
199 | ### Cost Efficiency Compared to Conventional Manufacturing
200 | 
201 | Tesla's revolutionary manufacturing techniques significantly lower production costs compared to traditional automakers. Key factors include:
202 | 
203 | - **Advanced Production Methods**: Techniques like gigapresses and integrated assembly lines reduce the overall complexity and number of parts needed, thus lowering labor and material costs [15](https://www.tesla.com/gigapress).
204 | 
205 | - **Economies of Scale**: As production expands across its Gigafactories, Tesla achieves diminishing unit costs. This contrasts with traditional automotive companies, which frequently depend on a fragmented supply chain [16](https://www.tesla.com/supplychain).
206 | 
207 | - **Cost Reduction Innovation**: Reports indicate that Tesla’s production costs are declining as the company continually seeks efficiencies in battery manufacturing and operational processes [17](https://www.tesla.com/financials).
208 | 
209 | ### Manufacturing Efficiency and Sustainability Metrics
210 | 
211 | #### Production Metrics
212 | 
213 | Recent figures highlight Tesla’s manufacturing prowess. In Q4 2023, Tesla produced approximately **495,000 vehicles**, marking a year-on-year increase of **35%** [18](https://ir.tesla.com/press-release/tesla-vehicle-production-deliveries-and-date-financial-results-webcast-fourth-quarter-2023).
214 | 
215 | #### Sustainability Outcomes
216 | 
217 | Tesla’s commitment to sustainability has tangible results:
218 | - **CO₂ Emissions Avoided**: In 2023, Tesla drivers avoided over **20 million metric tons of CO₂e emissions**, contributing significantly to global emission reduction efforts [19](https://www.tesla.com/ns_videos/2023-tesla-impact-report-highlights.pdf).
219 | - **Individual Vehicle Impact**: Each Tesla vehicle can potentially prevent about **51 metric tons of CO₂e** emissions over its lifespan compared to traditional combustion-engine vehicles [12](https://www.tesla.com/impact).
220 | 
221 | ### Conclusion
222 | 
223 | Tesla's innovative manufacturing processes exemplify a dedication to both efficiency and sustainability. Through the utilization of Gigafactories, cutting-edge automation, and a unique first principles approach, Tesla has positioned itself at the forefront of the automotive and energy sectors. The company not only enhances production efficiency but also significantly contributes to the reduction of carbon emissions, reaffirming its leadership in sustainable manufacturing.
224 | 
225 | ## Tesla and China: A Complex Relationship
226 | 
227 | Tesla's relationship with China is multifaceted, encompassing strategic partnerships, manufacturing capacities, market competition, sales dynamics, and regulatory challenges. This interaction is crucial for Tesla as China represents one of its largest markets and a significant part of its operational framework.
228 | 
229 | ### 1. Strategic Partnerships
230 | 
231 | Tesla has increasingly pursued local partnerships to strengthen its presence and operational efficiency in China. Notably, the company has engaged with Alibaba to leverage the tech giant's expertise in cloud services. This collaboration aims to enhance Tesla's Full Self-Driving (FSD) technology by utilizing Alibaba's data services, showcasing Tesla's adaptive strategy to align with local technological ecosystems and regulatory frameworks [20](https://www.ainvest.com/news/tesla-alibaba-potential-powerhouse-partnership-china-autonomous-driving-revolution-2503/).
232 | 
233 | These partnerships are a reflection of Tesla’s broader strategy to navigate the complexities of operating in China, where understanding local consumer behavior and regulations is critical for sustained growth. Collaborating with established local firms not only aids in compliance but also fosters innovation, allowing Tesla to enhance its technological capabilities while solidifying its market position.
234 | 
235 | ### 2. Manufacturing Footprint: Gigafactory Shanghai
236 | 
237 | Tesla's Gigafactory in Shanghai, established without a local partner, marks a significant milestone in its operational strategy. This facility is now integral to Tesla’s global supply chain, producing over 750,000 vehicles annually and accounting for more than 50% of Tesla's global deliveries [21](https://en.wikipedia.org/wiki/Gigafactory_Shanghai). The Shanghai plant achieved remarkable production milestones—producing its 3 millionth vehicle just 13 months after reaching 2 million, indicating efficient manufacturing operations. Each vehicle takes approximately 30 seconds to roll off the assembly line, supported by a 95% automation rate, which enhances productivity significantly [22](https://cnevpost.com/2024/01/10/automakers-nev-market-share-in-china-in-2023/).
238 | 
239 | Furthermore, substantial local government support has bolstered Giga Shanghai, which received over $600 million in financial backing and tax incentives, underscoring the reciprocal relationship between Tesla and local authorities. This support not only aids in operational cost reductions but also provides a considerable tax influx to the municipality.  
240 | 
241 | ### 3. Sales Dynamics and Market Position
242 | 
243 | Tesla's sales figures in China reflect both challenges and successes. Despite a slight decline in market share to approximately 7.8% by the end of 2023, down from nearly 9% the previous year, Tesla still generated about **$21.75 billion** from its Chinese operations, accounting for 22.5% of the company’s total revenue [23](https://www.teslarati.com/tesla-china-overall-revenue-2023/). In 2023, Tesla sold around **603,664 vehicles** in China, constituting nearly 35% of global deliveries for Q4 alone [24](https://www.goodcarbadcar.net/tesla-china-sales-figures/).
244 | 
245 | Tesla’s position in the rapidly evolving market has been significantly challenged by local competitors, particularly BYD, which commands approximately **35%** of the NEV market. BYD's aggressive pricing and innovative products have intensified competition, pushing Tesla to reevaluate its strategies in promoting models like the Model 3 and Model Y, which are tailored to the Chinese consumer market. Analysts suggest that Tesla must innovate in design, service offerings, and product variety to maintain relevance as consumer preferences shift toward cost-effective alternatives.
246 | 
247 | ### 4. Regulatory Challenges
248 | 
249 | Navigating the regulatory framework in China presents a unique set of challenges for Tesla, particularly regarding its Full Self-Driving technology. The Chinese government has instituted stringent data security laws requiring that all data collected by Tesla’s vehicles must be stored locally. This regulation has delayed the deployment of FSD capabilities due to compliance hurdles with the Ministry of Industry and Information Technology [25](https://www.voanews.com/a/tesla-clears-key-regulatory-hurdles-for-self-driving-in-china-during-musk-visit/7588990.html).
250 | 
251 | Elon Musk's recent visit to China highlighted ongoing discussions about operational hurdles and regulatory compliance, indicating a supportive but cautious regulatory environment. Successful navigation of these regulatory landscapes is essential for Tesla to maintain its competitive edge and operational viability in China.
252 | 
253 | ### 5. Competitive Landscape and Future Implications
254 | 
255 | The Chinese EV market continues to pose significant competitive pressures, particularly from BYD and other local manufacturers. With BYD's introduction of advanced technologies like ultra-fast charging and augmented driving assistance systems, Tesla is prompted to potentially revise its pricing strategies to remain viable. This competitive pressure is compounded by changes in consumer behavior, as price sensitivity increases amid economic fluctuations within China.
256 | 
257 | Despite the challenges, Tesla's operational success hinges on its ability to adapt to local consumer preferences and regulatory requirements. The ongoing success of Giga Shanghai is critical, as it not only contributes significantly to Tesla's production goals but also serves as a model for how foreign companies can effectively engage in the Chinese market.
258 | 
259 | In summary, Tesla’s strategic partnerships, robust manufacturing operations, intricate regulatory navigation, and response to competitive pressures are key dynamics shaping its relationship with China. As Tesla continues to leverage local expertise and adapt its strategies, the company's trajectories in this vital market will be pivotal for its global standing in the electric vehicle sector.
260 | 
261 | ## Key Advancements in Tesla's Battery Technology and Their Influence on the EV Market
262 | 
263 | Tesla has consistently set the pace in the electric vehicle (EV) industry, particularly through its innovations in battery technology. These advancements not only enhance the performance and affordability of its vehicles but also significantly influence the broader EV market landscape. Below is a comprehensive overview of Tesla's key battery technology advancements, their implications for EV pricing strategies, and how they reshape the competitive dynamics within the industry.
264 | 
265 | ## Overview of Tesla's Battery Technology Innovations
266 | 
267 | ### Sustainable Production Practices
268 | Tesla's commitment to sustainability is evident in its efforts to produce battery components in-house, notably targeting cathode materials. The company's focus on reducing reliance on traditional materials aligns with its objectives to lower production costs while enhancing efficiency. This in-house production method is designed to streamline the supply chain, reduce environmental impacts, and enable more controlled manufacturing processes.
269 | 
270 | ### Key Innovations
271 | 
272 | 1. **Nickel-Rich Cathode Materials**:  
273 |    Tesla's use of nickel-rich cathodes has become a standard in its battery technology. These materials offer higher energy density and contain significantly less cobalt, easing some ethical concerns tied to cobalt mining. This shift not only improves battery performance but also lowers costs associated with raw materials, especially cobalt, which has become increasingly expensive due to supply chain issues and ethical concerns [26](https://www.tesla.com/blog/tesla-lithium-refinery-groundbreaking).
274 | 
275 | 2. **Acid-Free Lithium Refining**:   
276 |    Tesla is pushing boundaries with its acid-free lithium refining process, which aims to minimize the environmental footprint traditionally associated with lithium extraction. This innovation is critical as the industry seeks to scale up production without imposing heavy ecological costs. By minimizing the use of harmful chemicals in lithium processing, Tesla is setting a new standard for sustainable battery production [27](https://www.tesla.com/ns_videos/2021-tesla-impact-report.pdf).
277 | 
278 | 3. **4680 Battery Cell Design**:  
279 |    The 4680 battery cell, a major technological leap for Tesla, is notable for its larger format and innovations like a tabless design that enhances energy density and thermal performance while simplifying manufacturing processes. This new design is expected to reduce costs further due to the efficiencies gained from in-house production at Tesla's Gigafactory, as the company aims to control quality and manage costs more effectively [28](https://www.tesla.com/careers/search/job/sr-cell-engineer-4680-global-quality-239886).
280 | 
281 | ### Future Directions
282 | Tesla has indicated a focus on continuing improvements in battery cell and pack design, aiming for enhanced safety and longevity. Their ongoing research and development initiatives emphasize recycling processes to recover battery materials, which aligns with their goal of supporting the lifecycle management of EV batteries and sustainability in EV adoption [19](https://www.tesla.com/ns_videos/2023-tesla-impact-report-highlights.pdf).
283 | 
284 | ## Impact on EV Pricing Strategies
285 | 
286 | Tesla's advancements in battery technology have substantially influenced pricing strategies within the EV market. The integration of technologies such as the 4680 battery cell, which could cut production costs by up to **56%**, has positioned Tesla to offer more competitively priced vehicles. These advancements in production efficiency directly lead to lower retail prices for consumers, making electric vehicles more accessible [29](https://www.advicescout.com/tesla-battery-technology/?srsltid=AfmBOooVevQtJVdR4KBUKtXv-j48szhaBqy7zd7l4mr2OYf05cjaSwud).
287 | 
288 | ### Current Pricing Trends
289 | - **Cost Reductions**: Recent data indicates that battery costs have decreased significantly—by about **50%**—over the past few years. This drop has allowed Tesla to reduce the proportion of battery costs in vehicle prices; for instance, battery costs now account for only **7.5%** of the price of the Tesla Model 3, down from **15%** before [30](https://www.fastmarkets.com/insights/electric-vehicle-economics-how-lithium-ion-battery-costs-impact-ev-prices/).
290 | - **Affordable EV Models**: As Tesla improves its battery affordability, newer models like the **Model 3** and upcoming vehicles such as the **Cybertruck** and the **Roadster** are expected to penetrate broader consumer markets, thus further enhancing Tesla's market share in the EV landscape [31](https://goldmansachs.com/insights/articles/electric-vehicle-battery-prices-are-expected-to-fall-almost-50-percent-by-2025).
291 | 
292 | ## Implications for the EV Market
293 | 
294 | Tesla’s relentless focus on battery technology is setting new standards across the EV industry. By lowering battery production costs, Tesla not only strengthens its own position but also compels competitors to innovate similarly, reshaping the competitive dynamics of the EV market.
295 | 
296 | ### Competitive Landscape & Industry Response
297 | - **Market Dynamics**: Other manufacturers are increasingly pressured to enhance their battery technologies and offerings to maintain competitiveness against Tesla. Innovations from companies like **CATL** and **BYD** in battery performance and sustainability highlight the urgency for continual technological advancements across the industry [32](https://synergyfiles.com/2025/02/new-battery-technology-2025/).
298 | - **Global Influence**: Tesla's pricing strategies and technological advancements could make a significant impact on global EV adoption rates. Reduced prices and improved performance are expected to encourage more consumers to switch from traditional gasoline cars to electric vehicles, thus further entrenching EVs in the global automotive market [33](https://www.iea.org/reports/global-ev-outlook-2024).
299 | 
300 | ### Future Considerations
301 | Looking ahead, if battery prices are projected to fall below **$100 per kWh** by 2026, it could catalyze an even more rapid adoption of EVs, particularly as manufacturers begin exploring alternative chemistries and expand their product offerings [34](https://electrek.co/2024/12/10/ev-battery-prices-plummeting-great-news-for-buyers/).
302 | 
303 | ## Conclusion
304 | 
305 | In summary, Tesla’s advancements in battery technology, including innovations in cathode materials, manufacturing processes, and the development of the 4680 cell, play a critical role in lowering production costs and EV pricing. These changes are vital for the acceleration of electric vehicle adoption, impacting not only Tesla's market position but also the broader landscape of the EV industry. As Tesla continues to push the envelope on performance and sustainability, the ramifications of these developments will resonate throughout the automotive sector, challenging manufacturers worldwide to adapt and innovate.
306 | 
307 | ## Tesla's Financial Performance in 2025 and Influencing Factors
308 | 
309 | ### Overview of Financial Metrics for 2025
310 | 
311 | Tesla's financial outlook for 2025 shows a promising trajectory, underpinned by strategic expansions and technological advancements. Detailed insights acquired from various reports indicate expectations of robust revenue growth, controlled profit margins, and improvement in operational efficiency as the company navigates the next fiscal year.
312 | 
313 | ### Revenue Projections
314 | 
315 | Tesla anticipates significant revenue growth in 2025, with forecasts estimating revenues around **$127.61 billion**. This increase is attributed to enhanced production capacities and strategic sales expansions across various global markets. The company's sustained investment in its infrastructure, particularly through the development of new Gigafactories, is central to achieving these revenue goals. Increased vehicle deliveries are expected to be a crucial driver, as Tesla ramps up production to meet escalating consumer demand for electric vehicles (EVs) [35](https://ir.tesla.com/_flysystem/s3/sec/000162828025002993/tsla-20250129-gen.pdf).
316 | 
317 | ### Profit Margins
318 | 
319 | Despite rising costs associated with raw materials and logistics, Tesla is maintaining its focus on healthy profit margins. The company effectively manages its gross margins through improved production efficiency and economies of scale. Reports suggest that Tesla will continue to adapt its cost management strategies, leveraging innovation in sourcing materials to offset potential cost pressures. This approach will be vital in preserving profitability while also expanding sales .
320 | 
321 | ## Key Growth Factors
322 | 
323 | ### 1. Production Capacity
324 | 
325 | Tesla's aggressive scaling of production capacities is a cornerstone of its business strategy for 2025. The ongoing investments in Gigafactories are essential for increasing output, directly correlating to an anticipated escalation in vehicle sales. Analysts predict that successful implementation of this expanded capacity will significantly bolster revenue streams .
326 | 
327 | ### 2. Market Expansion
328 | 
329 | The company is actively entering new regional markets, thereby increasing its global footprint. The introduction of new models tailored to diverse consumer preferences is expected to further enhance Tesla's market share and drive revenue increases. As Tesla targets different demographics and price points, it plans to compete more effectively in both established and emerging markets [37](https://www.ainvest.com/news/tesla-2025-pain-believer-perspective-2502/).
330 | 
331 | ### 3. Technological Innovations
332 | 
333 | Technological progress remains a pivotal growth driver for Tesla. The company's focus on advancements in Full Self-Driving (FSD) capabilities and battery technology is anticipated to enhance its product portfolio significantly. Such innovations not only appeal to tech-savvy consumers but also bolster investor confidence, projecting a potential market cap reaching up to **$2 trillion** by the end of 2025 .
334 | 
335 | ### 4. Cost Management
336 | 
337 | Strategic innovations and technological advancements have placed Tesla in a position to minimize costs effectively. These measures are likely to have a positive impact on profit margins in the upcoming fiscal year, reducing vulnerability to external price fluctuations in raw materials and logistics [36](https://www.accessoriesfortesla.com/post/understanding-the-future-of-tesla-stock-insights-and-predictions-for-2025).
338 | 
339 | ## Challenges and Competitive Landscape
340 | 
341 | As Tesla navigates its financial path in 2025, it faces several significant challenges that could impact its performance.
342 | 
343 | ### Increased Competition
344 | 
345 | The rise of competitors, notably BYD and NIO from China, poses a substantial threat to Tesla's market share. BYD has reported aggressively rising revenues and overtook Tesla in annual sales in 2024, which has intensified the competition landscape. Analysts indicate that Tesla will need to react proactively to maintain its market position, potentially involving strategic pricing adjustments and increased efficiencies [38](https://nypost.com/2025/03/26/business/byd-aims-to-double-overseas-sales-to-800k-in-2025-as-tesla-competition-heats-up/).
346 | 
347 | ### Regulatory Challenges
348 | 
349 | Tesla's expansion into new markets could be hindered by varying regulatory frameworks and compliance requirements. Stricter automotive regulations in some regions may challenge operational capabilities and impact cost structures, necessitating strategic adaptations from the company .
350 | 
351 | ### Market Conditions and Sentiment
352 | 
353 | Broader economic conditions can adversely affect consumer demand and overall market sentiment towards Tesla. Economic downturns lead to decreased consumer spending, potentially impacting EV sales. The company's stock value is influenced by market perceptions, which can be volatile and reflective of both internal performance and external economic factors .
354 | 
355 | ### Brand Image and Leadership Influence
356 | 
357 | Elon Musk's public persona has garnered mixed responses, affecting brand perception among certain consumer segments. Controversies tied to Musk's political affiliations have provoked backlash, potentially driving away customers who feel alienated. Analytics suggest that maintaining a positive brand image is critical for Tesla to retain its loyal customer base and attract new buyers [39](https://www.barchart.com/story/news/31658195/analysts-are-seriously-divided-on-tesla-stock-is-120-or-550-more-likely-for-tsla-in-2025).
358 | 
359 | ## Conclusion
360 | 
361 | As Tesla heads toward 2025, its financial performance is set to reflect a blend of strategic initiatives and external pressures. While the company is well-positioned for growth through innovations and market expansions, it must navigate significant challenges posed by competitors and economic conditions. Investors and stakeholders are advised to closely monitor Tesla's upcoming financial reports and announcements, particularly the Q4 2024 results due on January 29, 2025, to gain further insights into the company's performance and strategic outlook moving forward [40](https://ir.tesla.com/).
362 | 
363 | ## References:
364 | 
365 | [1] https://www.tesla.com/ns_videos/2020-tesla-impact-report.pdf  
366 | [2] https://www.tesla.com/ns_videos/Tesla-Master-Plan-Part-3.pdf  
367 | [3] https://www.tesla.com/models  
368 | [4] https://www.tesla.com/energy  
369 | [5] https://www.tesla.com/support/autopilot  
370 | [6] https://www.tesla.com/giga-nevada  
371 | [7] https://www.tesla.com/megapack  
372 | [8] https://www.tesla.com/megafactory  
373 | [9] https://www.tesla.com/AI  
374 | [10] https://www.tesla.com/manufacturing  
375 | [11] https://www.tesla.com/careers/search/job/automation-engineer-manufacturing-engineering-225514  
376 | [12] https://www.tesla.com/impact  
377 | [13] https://www.tesla.com/giga-berlin  
378 | [14] https://www.tesla.com/engineering  
379 | [15] https://www.tesla.com/gigapress  
380 | [16] https://www.tesla.com/supplychain  
381 | [17] https://www.tesla.com/financials  
382 | [18] https://ir.tesla.com/press-release/tesla-vehicle-production-deliveries-and-date-financial-results-webcast-fourth-quarter-2023  
383 | [19] https://www.tesla.com/ns_videos/2023-tesla-impact-report-highlights.pdf  
384 | [20] https://www.ainvest.com/news/tesla-alibaba-potential-powerhouse-partnership-china-autonomous-driving-revolution-2503/  
385 | [21] https://en.wikipedia.org/wiki/Gigafactory_Shanghai  
386 | [22] https://cnevpost.com/2024/01/10/automakers-nev-market-share-in-china-in-2023/  
387 | [23] https://www.teslarati.com/tesla-china-overall-revenue-2023/  
388 | [24] https://www.goodcarbadcar.net/tesla-china-sales-figures/  
389 | [25] https://www.voanews.com/a/tesla-clears-key-regulatory-hurdles-for-self-driving-in-china-during-musk-visit/7588990.html  
390 | [26] https://www.tesla.com/blog/tesla-lithium-refinery-groundbreaking  
391 | [27] https://www.tesla.com/ns_videos/2021-tesla-impact-report.pdf  
392 | [28] https://www.tesla.com/careers/search/job/sr-cell-engineer-4680-global-quality-239886  
393 | [29] https://www.advicescout.com/tesla-battery-technology/?srsltid=AfmBOooVevQtJVdR4KBUKtXv-j48szhaBqy7zd7l4mr2OYf05cjaSwud  
394 | [30] https://www.fastmarkets.com/insights/electric-vehicle-economics-how-lithium-ion-battery-costs-impact-ev-prices/  
395 | [31] https://goldmansachs.com/insights/articles/electric-vehicle-battery-prices-are-expected-to-fall-almost-50-percent-by-2025  
396 | [32] https://synergyfiles.com/2025/02/new-battery-technology-2025/  
397 | [33] https://www.iea.org/reports/global-ev-outlook-2024  
398 | [34] https://electrek.co/2024/12/10/ev-battery-prices-plummeting-great-news-for-buyers/  
399 | [35] https://ir.tesla.com/_flysystem/s3/sec/000162828025002993/tsla-20250129-gen.pdf  
400 | [36] https://www.accessoriesfortesla.com/post/understanding-the-future-of-tesla-stock-insights-and-predictions-for-2025  
401 | [37] https://www.ainvest.com/news/tesla-2025-pain-believer-perspective-2502/  
402 | [38] https://nypost.com/2025/03/26/business/byd-aims-to-double-overseas-sales-to-800k-in-2025-as-tesla-competition-heats-up/  
403 | [39] https://www.barchart.com/story/news/31658195/analysts-are-seriously-divided-on-tesla-stock-is-120-or-550-more-likely-for-tsla-in-2025  
404 | [40] https://ir.tesla.com/  


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel"]
3 | build-backend = "setuptools.build_meta" 
4 | 
5 | [tool.pytest.ini_options]
6 | asyncio_mode = "auto"
7 | asyncio_default_fixture_loop_scope = "function"


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai
2 | python-dotenv
3 | aiohttp
4 | asyncio
5 | beautifulsoup4
6 | lxml
7 | pydantic
8 | openai-agents==0.0.7
9 | md2pdf


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | LATEST_VERSION = "0.0.10"
 4 | 
 5 | exclude_packages = []
 6 | 
 7 | with open("README.md", "r", encoding="utf-8") as fh:
 8 |     long_description = fh.read()
 9 |     
10 | with open("requirements.txt", "r") as f:
11 |     reqs = [line.strip() for line in f if not any(pkg in line for pkg in exclude_packages)]
12 | 
13 | setup(
14 |     name="deep-researcher",
15 |     version=LATEST_VERSION,
16 |     author="Jai Juneja",
17 |     author_email="jai@qxlabs.com",
18 |     description="A package for performing deep research using AI agents, implemented using the OpenAI Agents SDK",
19 |     long_description=long_description,
20 |     long_description_content_type="text/markdown",
21 |     url="https://github.com/qx-labs/agents-deep-research",
22 |     package_dir={'deep_researcher': 'deep_researcher'},
23 |     packages=find_packages(),
24 |     classifiers=[
25 |         "License :: OSI Approved :: Apache Software License",
26 |         "Operating System :: OS Independent",
27 |         "Intended Audience :: Developers",
28 |         "Intended Audience :: Education",
29 |         "Intended Audience :: Science/Research",
30 |         "Programming Language :: Python :: 3",
31 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
32 |     ],
33 |     python_requires=">=3.10",
34 |     install_requires=reqs,
35 |     extras_require={
36 |         'dev': [
37 |             'pytest',
38 |             'pytest-asyncio'
39 |         ]
40 |     },
41 |     entry_points={
42 |         'console_scripts': [
43 |             'deep-researcher=deep_researcher.main:cli_entry',
44 |         ],
45 |     },
46 | )


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qx-labs/agents-deep-research/4e3ce35be10a59dfdbe4180ce60085a6e5ffb7ff/tests/__init__.py


--------------------------------------------------------------------------------
/tests/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Model configuration file for tests.
 3 | 
 4 | Note: The appropriate environment variables need to be set up for each provider/model tested.
 5 | """
 6 | 
 7 | # ==== FOR TESTING DIFFERENT MODEL PROVIDERS ====
 8 | 
 9 | # Different model providers and corresponding models to be tested
10 | # Modify as needed to test different models
11 | # Note that this list of models is only used for basic testing
12 | PROVIDERS_TO_TEST = {
13 |     'openai': 'gpt-4o-mini',
14 |     'azureopenai': 'gpt-4o-mini',
15 |     'anthropic': 'claude-3-5-sonnet-latest',
16 |     'gemini': 'gemini-2.0-flash',
17 |     'deepseek': 'deepseek-chat',
18 |     'openrouter': 'google/gemma-3-4b-it:free',
19 | }
20 | 
21 | # ==== FOR TESTING ALL AGENTS, TOOLS AND STRUCTURED OUTPUTS ====
22 | 
23 | SEARCH_PROVIDER = 'serper'
24 | 
25 | # Note that the models need to support tool use
26 | 
27 | REASONING_MODEL_PROVIDER = 'openai'
28 | REASONING_MODEL = 'gpt-4o-mini'
29 | 
30 | MAIN_MODEL_PROVIDER = 'openai'
31 | MAIN_MODEL = 'gpt-4o-mini'
32 | 
33 | FAST_MODEL_PROVIDER = 'openai'
34 | FAST_MODEL = 'gpt-4o-mini'


--------------------------------------------------------------------------------
/tests/test_model_providers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test suite for different model providers, as specified in tests/config.py.
 3 | 
 4 | These tests check that:
 5 | - The model providers correctly return a response when run by a simple agent
 6 | - The model returns the expected content in its output
 7 | """
 8 | import pytest
 9 | from deep_researcher.agents.writer_agent import init_writer_agent
10 | from deep_researcher import ResearchRunner
11 | from deep_researcher import LLMConfig
12 | from .config import PROVIDERS_TO_TEST, SEARCH_PROVIDER
13 | 
14 | 
15 | @pytest.mark.parametrize("provider, model", list(PROVIDERS_TO_TEST.items()))
16 | @pytest.mark.asyncio
17 | async def test_model_provider(provider, model):
18 |     config = LLMConfig(
19 |         search_provider=SEARCH_PROVIDER,
20 |         reasoning_model_provider=provider,
21 |         reasoning_model=model,
22 |         main_model_provider=provider,
23 |         main_model=model,
24 |         fast_model_provider=provider,
25 |         fast_model=model,
26 |     )
27 |     agent = init_writer_agent(config)
28 | 
29 |     query = "What is the temperature in London today?"
30 |     findings = "The temperature in London today is 36 degrees Celsius. Source: weather.com"
31 | 
32 |     input_str = f"""
33 |     QUERY: {query}
34 | 
35 |     FINDINGS: {findings}
36 |     """
37 |     result = await ResearchRunner.run(agent, input_str)
38 |     output_str = result.final_output
39 | 
40 |     print(f"Testing {provider}/{model}:\n{output_str}\n")
41 | 
42 |     assert isinstance(output_str, str), f"The model {provider}/{model} failed to return a string"
43 |     assert len(output_str) > 0, f"The model {provider}/{model} failed to return any output"
44 |     assert "36" in output_str, f"The model {provider}/{model} failed to return the correct output"
45 | 


--------------------------------------------------------------------------------
/tests/test_reformat_references.py:
--------------------------------------------------------------------------------
  1 | from inspect import cleandoc
  2 | 
  3 | 
  4 | def test_reformat_references():
  5 |     from deep_researcher.agents.long_writer_agent import reformat_references
  6 |     # Test Case 1: First section, no existing references
  7 |     draft = cleandoc("""# Research Report
  8 | 
  9 |     ## Table of Contents
 10 | 
 11 |     1. Introduction
 12 |     2. Market Analysis
 13 |     3. Competitive Landscape
 14 |     4. Conclusion
 15 | 
 16 |     """)
 17 |     section_markdown_1 = cleandoc(
 18 |     """## Introduction
 19 | 
 20 |     This report examines the technology landscape in 2023 [1](https://tech-trends.com/2023). 
 21 |     The rapid advancement of AI has been a major trend [2](https://ai-report.org/trends).
 22 |     Cloud computing continues to evolve with new paradigms [3](https://cloud-computing.org/evolution).
 23 |     """
 24 |     )
 25 |     references_1 = [
 26 |         "[1] https://tech-trends.com/2023",
 27 |         "[2] https://ai-report.org/trends",
 28 |         "[3] https://cloud-computing.org/evolution"
 29 |     ]
 30 | 
 31 |     section_markdown_2 = cleandoc(
 32 |     """## Market Analysis
 33 | 
 34 |     The global tech market is expected to reach $5 trillion by 2025 [1](https://market-research.com/tech-forecast).
 35 |     AI technologies, as mentioned earlier [2](https://ai-report.org/trends), are driving significant growth.
 36 |     Open source software is gaining prominence [3](https://opensource-trends.org/report).
 37 |     Cloud providers are seeing 30% year-over-year growth [4](https://cloud-computing.org/evolution).
 38 |     """
 39 |     )
 40 |     references_2 = [
 41 |         "[1] https://market-research.com/tech-forecast",
 42 |         "[2] https://ai-report.org/trends",
 43 |         "[3] https://opensource-trends.org/report",
 44 |         "[4] https://cloud-computing.org/evolution"
 45 |     ]
 46 |     
 47 |     section_markdown_3 = cleandoc(
 48 |     """## Competitive Landscape
 49 | 
 50 |     Major players in the tech industry include established giants [1](https://tech-giants.com/report) and emerging startups.
 51 |     New AI models [2](https://ai-report.org/trends) are disrupting traditional software development.
 52 |     The open source ecosystem [3](https://opensource-trends.org/report) creates a collaborative environment.
 53 |     Venture capital investment reached $150 billion in 2022 [4](https://vc-funding.org/tech-2022).
 54 |     Cloud market share is dominated by three major providers [5](https://cloud-computing.org/evolution).
 55 |     """
 56 |     )
 57 |     references_3 = [
 58 |         "[1] https://tech-giants.com/report",
 59 |         "[2] https://ai-report.org/trends",
 60 |         "[3] https://opensource-trends.org/report",
 61 |         "[4] https://vc-funding.org/tech-2022",
 62 |         "[5] https://cloud-computing.org/evolution"
 63 |     ]
 64 |     
 65 |     # Process each section in sequence
 66 |     all_references = []
 67 | 
 68 |     # Process the first section
 69 |     section_markdown_1, all_references = reformat_references(section_markdown_1, references_1, all_references)
 70 |     draft += section_markdown_1 + "\n\n"
 71 |     
 72 |     # Process the second section
 73 |     section_markdown_2, all_references = reformat_references(section_markdown_2, references_2, all_references)
 74 |     draft += section_markdown_2 + "\n\n"
 75 |     
 76 |     # Process the third section
 77 |     section_markdown_3, all_references = reformat_references(section_markdown_3, references_3, all_references)
 78 |     draft += section_markdown_3 + "\n\n"
 79 | 
 80 |     # Produce the final report
 81 |     final_report = draft + "## References:\n\n" + "\n".join(all_references)
 82 | 
 83 |     expected_final_report = cleandoc(
 84 |         """# Research Report
 85 | 
 86 |         ## Table of Contents
 87 | 
 88 |         1. Introduction
 89 |         2. Market Analysis
 90 |         3. Competitive Landscape
 91 |         4. Conclusion## Introduction
 92 | 
 93 |         This report examines the technology landscape in 2023 [1](https://tech-trends.com/2023). 
 94 |         The rapid advancement of AI has been a major trend [2](https://ai-report.org/trends).
 95 |         Cloud computing continues to evolve with new paradigms [3](https://cloud-computing.org/evolution).
 96 | 
 97 |         ## Market Analysis
 98 | 
 99 |         The global tech market is expected to reach $5 trillion by 2025 [4](https://market-research.com/tech-forecast).
100 |         AI technologies, as mentioned earlier [2](https://ai-report.org/trends), are driving significant growth.
101 |         Open source software is gaining prominence [5](https://opensource-trends.org/report).
102 |         Cloud providers are seeing 30% year-over-year growth [3](https://cloud-computing.org/evolution).
103 | 
104 |         ## Competitive Landscape
105 | 
106 |         Major players in the tech industry include established giants [6](https://tech-giants.com/report) and emerging startups.
107 |         New AI models [2](https://ai-report.org/trends) are disrupting traditional software development.
108 |         The open source ecosystem [5](https://opensource-trends.org/report) creates a collaborative environment.
109 |         Venture capital investment reached $150 billion in 2022 [7](https://vc-funding.org/tech-2022).
110 |         Cloud market share is dominated by three major providers [3](https://cloud-computing.org/evolution).
111 | 
112 |         ## References:
113 | 
114 |         [1] https://tech-trends.com/2023
115 |         [2] https://ai-report.org/trends
116 |         [3] https://cloud-computing.org/evolution
117 |         [4] https://market-research.com/tech-forecast
118 |         [5] https://opensource-trends.org/report
119 |         [6] https://tech-giants.com/report
120 |         [7] https://vc-funding.org/tech-2022
121 |         """
122 |     )
123 | 
124 |     assert final_report == expected_final_report
125 | 


--------------------------------------------------------------------------------
/tests/test_reformat_section_headings.py:
--------------------------------------------------------------------------------
 1 | from inspect import cleandoc
 2 | 
 3 | 
 4 | def test_reformat_section_headings():
 5 |     from deep_researcher.agents.long_writer_agent import reformat_section_headings
 6 | 
 7 |     # Test case 1: Empty input
 8 |     empty_input = ""
 9 |     assert reformat_section_headings(empty_input) == ""
10 | 
11 |     # Test case 2: Input with no headings
12 |     no_headings = "This is just regular text\nwith multiple lines\nbut no headings"
13 |     assert reformat_section_headings(no_headings) == no_headings
14 | 
15 |     # Test case 3: Input with h1 heading (should become h2)
16 |     h1_input = "# Main Title\nSome content\n## Subtitle"
17 |     expected_h1 = "## Main Title\nSome content\n### Subtitle"
18 |     assert reformat_section_headings(h1_input) == expected_h1
19 | 
20 |     # Test case 4: Input with h3 heading (should become h2)
21 |     h3_input = "### Deep Title\nSome content\n#### Subtitle"
22 |     expected_h3 = "## Deep Title\nSome content\n### Subtitle"
23 |     assert reformat_section_headings(h3_input) == expected_h3
24 | 
25 |     # Test case 5: Mixed heading levels
26 |     mixed_input = cleandoc(
27 |         """### Section
28 |         Some content
29 |         # Big Title
30 |         More content
31 |         #### Subsection
32 |         """
33 |     )
34 |     expected_mixed = cleandoc(
35 |         """## Section
36 |         Some content
37 |         # Big Title
38 |         More content
39 |         ### Subsection
40 |         """
41 |     )
42 |     assert reformat_section_headings(mixed_input) == expected_mixed
43 | 


--------------------------------------------------------------------------------
/tests/test_research_agents.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test suite for the main research agents.
  3 | 
  4 | These tests check that:
  5 | - The agents correctly format their output as the correct type (e.g. structured outputs where applicable)
  6 | - The agents produce the expected output content
  7 | """
  8 | import pytest
  9 | from .config import \
 10 |     SEARCH_PROVIDER, \
 11 |     REASONING_MODEL_PROVIDER, \
 12 |     REASONING_MODEL, \
 13 |     MAIN_MODEL_PROVIDER, \
 14 |     MAIN_MODEL, \
 15 |     FAST_MODEL_PROVIDER, \
 16 |     FAST_MODEL
 17 | from deep_researcher import LLMConfig, ResearchRunner
 18 | from deep_researcher.agents.knowledge_gap_agent import init_knowledge_gap_agent, KnowledgeGapOutput
 19 | from deep_researcher.agents.long_writer_agent import init_long_writer_agent, LongWriterOutput, write_report, ReportDraft
 20 | from deep_researcher.agents.planner_agent import init_planner_agent, ReportPlan
 21 | from deep_researcher.agents.proofreader_agent import init_proofreader_agent, ReportDraft, ReportDraftSection
 22 | from deep_researcher.agents.writer_agent import init_writer_agent
 23 | from deep_researcher.agents.thinking_agent import init_thinking_agent
 24 | from deep_researcher.agents.tool_selector_agent import init_tool_selector_agent, AgentSelectionPlan, AgentTask
 25 | 
 26 | # Configuration for LLMs
 27 | config = LLMConfig(
 28 |     search_provider=SEARCH_PROVIDER,
 29 |     reasoning_model_provider=REASONING_MODEL_PROVIDER,
 30 |     reasoning_model=REASONING_MODEL,
 31 |     main_model_provider=MAIN_MODEL_PROVIDER,
 32 |     main_model=MAIN_MODEL,
 33 |     fast_model_provider=FAST_MODEL_PROVIDER,
 34 |     fast_model=FAST_MODEL
 35 | )
 36 | 
 37 | 
 38 | @pytest.mark.asyncio
 39 | async def test_knowledge_gap_agent():
 40 |     """Test the KnowledgeGapAgent."""
 41 |     agent = init_knowledge_gap_agent(config)
 42 | 
 43 |     # Example of a user input that requires research to complete (i.e. knowledge gaps exist)
 44 |     initial_user_input = """
 45 |     ORIGINAL QUERY: What is the founding history of the company QX Labs (qxlabs.com)?
 46 | 
 47 |     HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
 48 |     - No research has been carried out yet.
 49 |     """
 50 |     result = await ResearchRunner.run(agent, initial_user_input)
 51 |     agent_output = result.final_output_as(KnowledgeGapOutput)
 52 | 
 53 |     assert isinstance(agent_output, KnowledgeGapOutput), "The KnowledgeGapAgent is not correctly formatting its structured output"
 54 |     assert agent_output.research_complete is False, "The KnowledgeGapAgent is not correctly evaluating research as incomplete"
 55 |     assert len(agent_output.outstanding_gaps) > 0, "The KnowledgeGapAgent is not correctly identifying knowledge gaps"
 56 |     
 57 |     # Example of a user input that doesn't require further research to complete (i.e. no knowledge gaps exist)
 58 |     final_user_input = """
 59 |     ORIGINAL QUERY: What is the capital of France?
 60 | 
 61 |     HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
 62 |     - Thinking: I need to run a web search to find the capital of France
 63 |     - Action: Running WebSearchAgent with query 'France capital city'
 64 |     - Findings: The capital of France is Paris
 65 |     """
 66 |     result = await ResearchRunner.run(agent, final_user_input)
 67 |     agent_output = result.final_output_as(KnowledgeGapOutput)
 68 | 
 69 |     assert isinstance(agent_output, KnowledgeGapOutput), "The KnowledgeGapAgent is not correctly formatting its structured output"
 70 |     assert agent_output.research_complete is True, "The KnowledgeGapAgent is not correctly evaluating research as complete"
 71 |     assert len(agent_output.outstanding_gaps) == 0, "The KnowledgeGapAgent is not correctly identifying knowledge gaps"
 72 | 
 73 | 
 74 | @pytest.mark.asyncio
 75 | async def test_long_writer_agent():
 76 |     """Test the LongWriterAgent."""
 77 |     agent = init_long_writer_agent(config)
 78 |     user_input = """
 79 |     ===========================================================
 80 |     ORIGINAL QUERY: Tell me about common dog breeds.
 81 | 
 82 |     CURRENT REPORT DRAFT: # Dog Breeds Report\n\n## Table of Contents\n1. Introduction\n2. Golden Retrievers\n\n## Introduction\n\nThis is a report about dog breeds.
 83 | 
 84 |     TITLE OF NEXT SECTION TO WRITE: Golden Retrievers
 85 | 
 86 |     DRAFT OF NEXT SECTION: Golden Retrievers are friendly dogs [1](https://goldies.com/dogs). They are good family pets.
 87 |     ===========================================================
 88 |     """
 89 |     result = await ResearchRunner.run(agent, user_input)
 90 |     agent_output = result.final_output_as(LongWriterOutput)
 91 | 
 92 |     assert isinstance(agent_output, LongWriterOutput), "The LongWriterAgent is not correctly formatting its structured output"
 93 |     assert "golden retrievers" in agent_output.next_section_markdown.lower(), "The LongWriterAgent is not correctly writing markdown content"
 94 |     assert len(agent_output.references) >= 1, "No references are present in the output of the LongWriterAgent"
 95 |     assert "goldies.com" in agent_output.references[0], "The correct reference is not present in the output of the LongWriterAgent"
 96 | 
 97 | 
 98 | @pytest.mark.asyncio
 99 | async def test_planner_agent():
100 |     """Test the PlannerAgent."""
101 |     agent = init_planner_agent(config)
102 |     # NOTE: This agent uses tools (search/crawl) which might make the test slow/flaky
103 |     # For a robust unit test, we might mock these tools, but for now, we'll run it.
104 |     user_input = """
105 |     QUERY: What are the main features of the Python programming language?
106 |     """
107 |     result = await ResearchRunner.run(agent, user_input)
108 |     agent_output = result.final_output_as(ReportPlan)
109 | 
110 |     assert isinstance(agent_output, ReportPlan), "The PlannerAgent is not correctly formatting its structured output"
111 |     assert len(agent_output.report_outline) > 0, "The PlannerAgent is not producing a report outline"
112 |     assert "python" in agent_output.report_title.lower(), "The PlannerAgent is not correctly naming the report"
113 | 
114 | 
115 | @pytest.mark.asyncio
116 | async def test_proofreader_agent():
117 |     """Test the ProofreaderAgent."""
118 |     agent = init_proofreader_agent(config)
119 |     query = "Write a short summary of the sun."
120 |     draft = ReportDraft(
121 |         sections=[
122 |             ReportDraftSection(section_title="Introduction", section_content="The sun is a star. It is big [1]."),
123 |             ReportDraftSection(section_title="Details", section_content="The sun provides light [2]. It is very hot."),
124 |             ReportDraftSection(section_title="References", section_content="[1] https://example.com/sun [2] https://example.com/light")
125 |         ]
126 |     )
127 |     user_input = f"""
128 |     ====
129 |     QUERY:
130 |     {query}
131 | 
132 |     REPORT DRAFT:
133 |     {draft.model_dump_json()}
134 |     ====
135 |     """
136 |     result = await ResearchRunner.run(agent, user_input)
137 |     final_report = result.final_output
138 | 
139 |     assert isinstance(final_report, str), "The ProofreaderAgent is not correctly formatting its output as a string"
140 |     assert "## Introduction" in final_report, "The ProofreaderAgent is not correctly adding section headings"
141 | 
142 | 
143 | @pytest.mark.asyncio
144 | async def test_writer_agent():
145 |     """Test the WriterAgent."""
146 |     agent = init_writer_agent(config)
147 |     user_input = """
148 |     ===========================================================
149 |     QUERY: Who is the founder of QX Labs?
150 | 
151 |     FINDINGS:
152 |     - QX Labs was founded by Jai Juneja [1].
153 |     - It is based in London[2].
154 |     - Sources:
155 |       [1] https://qxlabs.com
156 |       [2] https://qxlabs.com/about
157 |     ===========================================================
158 |     """
159 |     result = await ResearchRunner.run(agent, user_input)
160 |     final_report = result.final_output # Writer outputs raw markdown
161 | 
162 |     assert isinstance(final_report, str), "The WriterAgent is not correctly formatting its output as a string"
163 |     assert "jai" in final_report.lower(), "The WriterAgent is not producing the expected markdown content"
164 |     assert "qxlabs.com" in final_report, "The WriterAgent is not correctly citing references"
165 | 
166 | 
167 | @pytest.mark.asyncio
168 | async def test_thinking_agent():
169 |     """Test the ThinkingAgent."""
170 |     agent = init_thinking_agent(config)
171 |     user_input = """
172 |     ORIGINAL QUERY: What is the story of QX Labs (qxlabs.com)?
173 | 
174 |     HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
175 |     - Iteration 1:
176 |         - Thought: I need learn more about what QX Labs is and what they do
177 |         - Action: Run SiteCrawlerAgent for qxlabs.com
178 |         - Findings: QX Labs is an AI research company that develops workflow automation tools.
179 |     - Iteration 2:
180 |         - Thought: I need to find out more about the founding history of QX Labs.
181 |         - Action: Run WebSearchAgent for 'QX Labs founding history'
182 |         - Findings: QX Labs was founded in 2023 by Jai Juneja and is based in London.
183 |     """
184 |     result = await ResearchRunner.run(agent, user_input)
185 |     thoughts = result.final_output # Thinking agent outputs raw text
186 | 
187 |     assert isinstance(thoughts, str), "The ThinkingAgent is not correctly formatting its output as a string"
188 |     assert len(thoughts) > 0, "The ThinkingAgent is not producing any output"
189 |     assert any([keyword in thoughts.lower() for keyword in ['jai', 'iteration', 'qx', 'london', '2023', 'found', 'automation']]), "The ThinkingAgent is not producing any relevant thoughts"
190 | 
191 | 
192 | @pytest.mark.asyncio
193 | async def test_tool_selector_agent():
194 |     """Test the ToolSelectorAgent."""
195 |     agent = init_tool_selector_agent(config)
196 |     user_input = """
197 |     ORIGINAL QUERY: Provide a comprehensive overview of the company QX Labs (qxlabs.com)
198 | 
199 |     KNOWLEDGE GAP TO ADDRESS: Need to find out what products or services QX Labs (qxlabs.com) offers.
200 | 
201 |     BACKGROUND CONTEXT: QX Labs is an AI research company that develops workflow automation tools.
202 | 
203 |     HISTORY OF ACTIONS, FINDINGS AND THOUGHTS: No research has been carried out yet.
204 |     """
205 |     result = await ResearchRunner.run(agent, user_input)
206 |     agent_output = result.final_output_as(AgentSelectionPlan)
207 | 
208 |     assert isinstance(agent_output, AgentSelectionPlan), "The ToolSelectorAgent is not correctly formatting its output as an AgentSelectionPlan"
209 |     assert len(agent_output.tasks) > 0, "The ToolSelectorAgent is not producing any tasks"
210 |     assert agent_output.tasks[0].agent in ["WebSearchAgent", "SiteCrawlerAgent"], "The ToolSelectorAgent is not correctly specifying an agent to hand off to"
211 | 


--------------------------------------------------------------------------------
/tests/test_tool_agents.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test suite for the tool agents (search_agent and crawl_agent).
 3 | 
 4 | These tests check that:
 5 | - The tool agents successfully call the search/crawl tools
 6 | - The search and crawl tools correctly return results
 7 | - The tool agents correctly process the tool output and parse it into the correct format
 8 | """
 9 | import pytest
10 | from .config import \
11 |     SEARCH_PROVIDER, \
12 |     REASONING_MODEL_PROVIDER, \
13 |     REASONING_MODEL, \
14 |     MAIN_MODEL_PROVIDER, \
15 |     MAIN_MODEL, \
16 |     FAST_MODEL_PROVIDER, \
17 |     FAST_MODEL
18 | from deep_researcher import LLMConfig, ResearchRunner
19 | from deep_researcher.agents.tool_agents import ToolAgentOutput
20 | from deep_researcher.agents.tool_agents.search_agent import init_search_agent
21 | from deep_researcher.agents.tool_agents.crawl_agent import init_crawl_agent
22 | from deep_researcher.agents.tool_selector_agent import AgentTask
23 | 
24 | config = LLMConfig(
25 |     search_provider=SEARCH_PROVIDER,
26 |     reasoning_model_provider=REASONING_MODEL_PROVIDER,
27 |     reasoning_model=REASONING_MODEL,
28 |     main_model_provider=MAIN_MODEL_PROVIDER,
29 |     main_model=MAIN_MODEL,
30 |     fast_model_provider=FAST_MODEL_PROVIDER,
31 |     fast_model=FAST_MODEL
32 | )
33 | 
34 | 
35 | @pytest.mark.asyncio
36 | async def test_search_agent():
37 |     """Test the WebSearchAgent."""
38 |     search_agent = init_search_agent(config)
39 |     agent_task = AgentTask(
40 |         gap="Need to determine the capital of France",
41 |         agent="WebSearchAgent",
42 |         query="France capital city"
43 |     )
44 |     result = await ResearchRunner.run(search_agent, agent_task.model_dump_json())
45 |     agent_output = result.final_output_as(ToolAgentOutput)
46 | 
47 |     assert isinstance(agent_output, ToolAgentOutput), "The WebSearchAgent is not correctly formatting its output as a ToolAgentOutput"
48 |     assert len(agent_output.output) > 0, "The WebSearchAgent is not correctly retrieving and parsing data from the search tool"
49 |     assert "paris" in agent_output.output.lower(), "The WebSearchAgent is not correctly retrieving and parsing data from the search tool"
50 | 
51 | 
52 | @pytest.mark.asyncio
53 | async def test_crawl_agent():
54 |     """Test the SiteCrawlerAgent."""
55 |     crawl_agent = init_crawl_agent(config)
56 |     agent_task = AgentTask(
57 |         gap="Need to determine what the website is about",
58 |         agent="SiteCrawlerAgent",
59 |         query="What is the purpose of this website?",
60 |         entity_website="https://crawler-test.com/"
61 |     )
62 |     result = await ResearchRunner.run(crawl_agent, agent_task.model_dump_json())
63 |     agent_output = result.final_output_as(ToolAgentOutput)
64 | 
65 |     assert isinstance(agent_output, ToolAgentOutput), "The SiteCrawlerAgent is not correctly formatting its output as a ToolAgentOutput"
66 |     assert len(agent_output.output) > 0, "The SiteCrawlerAgent is not correctly retrieving and parsing data from the crawl tool"
67 |     assert "test" in agent_output.output.lower(), "The SiteCrawlerAgent is not correctly retrieving and parsing data from the crawl tool"
68 | 


--------------------------------------------------------------------------------