├── .dockerignore ├── .env.example ├── .github ├── dependabot.yml └── workflows │ └── docker-bulid.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README-zh_CN.md ├── README.md ├── backend ├── __init__.py ├── report_type │ ├── __init__.py │ ├── basic_report │ │ ├── __init__.py │ │ └── basic_report.py │ └── detailed_report │ │ ├── README.md │ │ ├── __init__.py │ │ └── detailed_report.py ├── server.py ├── utils.py └── websocket_manager.py ├── cli.py ├── docker-compose.yml ├── docs ├── CNAME ├── README.md ├── babel.config.js ├── blog │ ├── 2023-09-22-gpt-researcher │ │ ├── architecture.png │ │ ├── index.md │ │ └── planner.jpeg │ ├── 2023-11-12-openai-assistant │ │ ├── diagram-1.png │ │ ├── diagram-assistant.jpeg │ │ └── index.md │ ├── 2024-05-19-gptr-langgraph │ │ ├── architecture.jpeg │ │ ├── blog-langgraph.jpeg │ │ └── index.md │ └── authors.yml ├── docs │ ├── contribute.md │ ├── examples │ │ ├── examples.ipynb │ │ └── examples.md │ ├── faq.md │ ├── gpt-researcher │ │ ├── config.md │ │ ├── example.md │ │ ├── getting-started.md │ │ ├── introduction.md │ │ ├── langgraph.md │ │ ├── llms.md │ │ ├── pip-package.md │ │ ├── roadmap.md │ │ ├── tailored-research.md │ │ └── troubleshooting.md │ ├── reference │ │ ├── config │ │ │ ├── config.md │ │ │ └── singleton.md │ │ ├── processing │ │ │ ├── html.md │ │ │ └── text.md │ │ └── sidebar.json │ └── welcome.md ├── docusaurus.config.js ├── package.json ├── pydoc-markdown.yml ├── sidebars.js ├── src │ ├── components │ │ ├── HomepageFeatures.js │ │ └── HomepageFeatures.module.css │ ├── css │ │ └── custom.css │ └── pages │ │ ├── index.js │ │ └── index.module.css ├── static │ ├── .nojekyll │ └── img │ │ ├── architecture.png │ │ ├── banner1.jpg │ │ ├── examples.png │ │ ├── favicon.ico │ │ ├── gptr-logo.png │ │ ├── gptresearcher.png │ │ ├── leaderboard.png │ │ ├── multi-agent.png │ │ └── tavily.png └── yarn.lock ├── examples ├── pip-run.ipynb └── sample_report.py ├── frontend ├── index.html ├── pdf_styles.css ├── scripts.js ├── static │ ├── academicResearchAgentAvatar.png │ ├── businessAnalystAgentAvatar.png │ ├── computerSecurityanalystAvatar.png │ ├── defaultAgentAvatar.JPG │ ├── favicon.ico │ ├── financeAgentAvatar.png │ ├── gptr-logo.png │ ├── mathAgentAvatar.png │ └── travelAgentAvatar.png └── styles.css ├── gpt_researcher ├── README.md ├── __init__.py ├── config │ ├── __init__.py │ └── config.py ├── context │ ├── __init__.py │ ├── compression.py │ └── retriever.py ├── document │ ├── __init__.py │ └── document.py ├── llm_provider │ ├── __init__.py │ ├── azureopenai │ │ ├── __init__.py │ │ └── azureopenai.py │ ├── google │ │ ├── __init__.py │ │ └── google.py │ └── openai │ │ ├── __init__.py │ │ └── openai.py ├── master │ ├── __init__.py │ ├── agent.py │ ├── functions.py │ └── prompts.py ├── memory │ ├── __init__.py │ └── embeddings.py ├── retrievers │ ├── __init__.py │ ├── bing │ │ ├── __init__.py │ │ └── bing.py │ ├── duckduckgo │ │ ├── __init__.py │ │ └── duckduckgo.py │ ├── google │ │ ├── __init__.py │ │ └── google.py │ ├── searx │ │ ├── __init__.py │ │ └── searx.py │ ├── serpapi │ │ ├── __init__.py │ │ └── serpapi.py │ ├── serper │ │ ├── __init__.py │ │ └── serper.py │ ├── tavily_search │ │ ├── __init__.py │ │ └── tavily_search.py │ └── yahoo │ │ ├── __init__.py │ │ └── yahoo.py ├── scraper │ ├── __init__.py │ ├── arxiv │ │ ├── __init__.py │ │ └── arxiv.py │ ├── beautiful_soup │ │ ├── __init__.py │ │ └── beautiful_soup.py │ ├── newspaper │ │ ├── __init__.py │ │ └── newspaper.py │ ├── pymupdf │ │ ├── 
__init__.py │ │ └── pymupdf.py │ ├── scraper.py │ └── web_base_loader │ │ ├── __init__.py │ │ └── web_base_loader.py └── utils │ ├── __init__.py │ ├── enum.py │ ├── llm.py │ └── validators.py ├── main.py ├── multi_agents ├── README.md ├── agent.py ├── agents │ ├── __init__.py │ ├── editor.py │ ├── master.py │ ├── publisher.py │ ├── researcher.py │ ├── reviewer.py │ ├── reviser.py │ ├── utils │ │ ├── __init__.py │ │ ├── file_formats.py │ │ ├── llms.py │ │ ├── pdf_styles.css │ │ └── views.py │ └── writer.py ├── langgraph.json ├── main.py ├── memory │ ├── __init__.py │ ├── draft.py │ └── research.py ├── requirements.txt └── task.json ├── poetry.lock ├── poetry.toml ├── pyproject.toml ├── requirements.txt ├── scraping ├── js │ └── overlay.js ├── processing │ ├── __init__.py │ ├── html.py │ └── text.py ├── scrape_skills.py └── web_scrape.py ├── setup.py └── tests ├── all-6-report-types.py └── documents-report-source.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | TAVILY_API_KEY= 3 | LANGCHAIN_API_KEY= 4 | DOC_PATH=./my-docs -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "docker" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/workflows/docker-bulid.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions Workflow 2 | run-name: ${{ github.actor }} has started docker build workflow. 
3 | on: 4 | pull_request: 5 | types: [opened, edited, ready_for_review] 6 | jobs: 7 | docker: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Git checkout 11 | uses: actions/checkout@master 12 | - name: Set up QEMU 13 | uses: docker/setup-qemu-action@v2 14 | - name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | with: 17 | driver: docker 18 | - name: Build Dockerfile 19 | uses: docker/build-push-action@v4 20 | with: 21 | push: false 22 | tags: assafelovic/gpt-researcher:latest 23 | file: Dockerfile 24 | 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #Ignore env containing secrets 2 | .env 3 | .venv 4 | .envrc 5 | 6 | #Ignore Virtual Env 7 | env/ 8 | venv/ 9 | .venv/ 10 | 11 | # Other Environments 12 | ENV/ 13 | env.bak/ 14 | venv.bak/ 15 | 16 | #Ignore generated outputs 17 | outputs/ 18 | 19 | #Ignore my local docs 20 | my-docs/ 21 | 22 | #Ignore pycache 23 | **/__pycache__/ 24 | 25 | #Ignore mypy cache 26 | .mypy_cache/ 27 | node_modules 28 | .idea 29 | .DS_Store 30 | .docusaurus 31 | build 32 | docs/build 33 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | Assaf.elovic@gmail.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to GPT Researcher 2 | First off, we'd like to welcome and thank you for your interest and effort in contributing to our open source project ❤️. Contributions of all forms are welcome, from new features and bug fixes to documentation and more. 3 | 4 | We are on a mission to build the #1 AI agent for comprehensive, unbiased, and factual research online. And we need your support to achieve this grand vision. 5 | 6 | Please take a moment to review this document in order to make the contribution process easy and effective for everyone involved. 7 | 8 | ## Reporting Issues 9 | 10 | If you come across any issue or have an idea for an improvement, don't hesitate to create an issue on GitHub. Describe your problem in sufficient detail, providing as much relevant information as possible. This way, we can reproduce the issue before attempting to fix it or respond appropriately. 11 | 12 | ## Contributing Code 13 | 14 | 1. **Fork the repository and create your branch from `master`.** 15 | If it's not an urgent bug fix, you should branch from `master` and work on the feature or fix there. 16 | 17 | 2. **Make your changes.** 18 | Implement your changes following best practices for coding in the project's language. 19 | 20 | 3. **Test your changes.** 21 | Make sure your changes pass all the tests if there are any. If the project doesn't have automated testing infrastructure, test your changes manually to confirm they behave as expected. 22 | 23 | 4. **Follow the coding style.** 24 | Ensure your code adheres to the coding conventions used throughout the project, including indentation, accurate comments, etc. 25 | 26 | 5. **Commit your changes.** 27 | Make your git commits informative and concise. This is very helpful for others when they look at the git log. 28 | 29 | 6. **Push to your fork and submit a pull request.** 30 | When your work is ready and passes tests, push your branch to your fork of the repository and submit a pull request from there. 31 | 32 | 7. **Pat yourself on the back and wait for the review.** 33 | Your work is done, congratulations! Now sit tight. The project maintainers will review your submission as soon as possible. They might suggest changes or ask for improvements. Both constructive conversation and patience are key to the collaboration process. 34 | 35 | 36 | ## Documentation 37 | 38 | If you would like to contribute to the project's documentation, please follow the same steps: fork the repository, make your changes, test them, and submit a pull request. 39 | 40 | Documentation is a vital part of any software. It's not just about having good code. Ensuring that users and contributors understand what's going on, how to use the software, and how to contribute is crucial. 41 | 42 | We're grateful for all our contributors, and we look forward to building the world's leading AI research agent hand-in-hand with you. Let's harness the power of Open Source and AI to change the world together! 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11.4-slim-bullseye as install-browser 2 | 3 | RUN apt-get update \ 4 | && apt-get satisfy -y \ 5 | "chromium, chromium-driver (>= 115.0)" \ 6 | && chromium --version && chromedriver --version 7 | 8 | RUN apt-get update \ 9 | && apt-get install -y --fix-missing firefox-esr wget \ 10 | && wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz \ 11 | && tar -xvzf geckodriver* \ 12 | && chmod +x geckodriver \ 13 | && mv geckodriver /usr/local/bin/ 14 | 15 | FROM install-browser as gpt-researcher-install 16 | 17 | ENV PIP_ROOT_USER_ACTION=ignore 18 | 19 | RUN mkdir /usr/src/app 20 | WORKDIR /usr/src/app 21 | 22 | COPY ./requirements.txt ./requirements.txt 23 | RUN pip install -r requirements.txt 24 | 25 | FROM gpt-researcher-install AS gpt-researcher 26 | 27 | RUN useradd -ms /bin/bash gpt-researcher \ 28 | && chown -R gpt-researcher:gpt-researcher /usr/src/app 29 | 30 | USER gpt-researcher 31 | 32 | COPY --chown=gpt-researcher:gpt-researcher ./ ./ 33 | 34 | EXPOSE 8000 35 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Assaf Elovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/backend/__init__.py -------------------------------------------------------------------------------- /backend/report_type/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_report.basic_report import BasicReport 2 | from .detailed_report.detailed_report import DetailedReport 3 | 4 | __all__ = [ 5 | "BasicReport", 6 | "DetailedReport" 7 | ] -------------------------------------------------------------------------------- /backend/report_type/basic_report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/backend/report_type/basic_report/__init__.py -------------------------------------------------------------------------------- /backend/report_type/basic_report/basic_report.py: -------------------------------------------------------------------------------- 1 | from gpt_researcher.master.agent import GPTResearcher 2 | from fastapi import WebSocket 3 | 4 | class BasicReport(): 5 | def __init__(self, query: str, report_type: str, report_source:str, source_urls, config_path: str, websocket: WebSocket): 6 | self.query = query 7 | self.report_type = report_type 8 | self.report_source = report_source 9 | self.source_urls = source_urls 10 | self.config_path = config_path 11 | self.websocket = websocket 12 | 13 | async def run(self): 14 | # Initialize researcher 15 | researcher = GPTResearcher(self.query, self.report_type, self.report_source, self.source_urls, self.config_path, self.websocket) 16 | 17 | # Run research 18 | await researcher.conduct_research() 19 | 20 | # and generate report 21 | report = await researcher.write_report() 22 | 23 | return report -------------------------------------------------------------------------------- /backend/report_type/detailed_report/README.md: -------------------------------------------------------------------------------- 1 | ## Detailed Reports 2 | 3 | Introducing long and detailed reports, with a completely new architecture inspired by the latest [STORM](https://arxiv.org/abs/2402.14207) paper. 4 | 5 | In this method we do the following: 6 | 7 | 1. Trigger Initial GPT Researcher report based on task 8 | 2. Generate subtopics from research summary 9 | 3. For each subtopic the headers of the subtopic report are extracted and accumulated 10 | 4. For each subtopic a report is generated making sure that any information about the headers accumulated until now are not re-generated. 11 | 5. An additional introduction section is written along with a table of contents constructed from the entire report. 12 | 6. 
The final report is constructed by appending these : Intro + Table of contents + Subsection reports -------------------------------------------------------------------------------- /backend/report_type/detailed_report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/backend/report_type/detailed_report/__init__.py -------------------------------------------------------------------------------- /backend/server.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect 2 | from fastapi.staticfiles import StaticFiles 3 | from fastapi.templating import Jinja2Templates 4 | from pydantic import BaseModel 5 | from backend.websocket_manager import WebSocketManager 6 | from backend.utils import write_md_to_pdf, write_md_to_word, write_text_to_md 7 | import time 8 | import json 9 | import os 10 | 11 | 12 | class ResearchRequest(BaseModel): 13 | task: str 14 | report_type: str 15 | agent: str 16 | 17 | 18 | app = FastAPI() 19 | 20 | app.mount("/site", StaticFiles(directory="./frontend"), name="site") 21 | app.mount("/static", StaticFiles(directory="./frontend/static"), name="static") 22 | 23 | templates = Jinja2Templates(directory="./frontend") 24 | 25 | manager = WebSocketManager() 26 | 27 | 28 | # Dynamic directory for outputs once first research is run 29 | @app.on_event("startup") 30 | def startup_event(): 31 | if not os.path.isdir("outputs"): 32 | os.makedirs("outputs") 33 | app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") 34 | 35 | @app.get("/") 36 | async def read_root(request: Request): 37 | return templates.TemplateResponse('index.html', {"request": request, "report": None}) 38 | 39 | 40 | @app.websocket("/ws") 41 | async def websocket_endpoint(websocket: WebSocket): 42 | await manager.connect(websocket) 43 | try: 44 | while True: 45 | data = await websocket.receive_text() 46 | if data.startswith("start"): 47 | json_data = json.loads(data[6:]) 48 | task = json_data.get("task") 49 | report_type = json_data.get("report_type") 50 | filename = f"task_{int(time.time())}_{task}" 51 | report_source = json_data.get("report_source") 52 | if task and report_type: 53 | report = await manager.start_streaming(task, report_type, report_source, websocket) 54 | # Saving report as pdf 55 | pdf_path = await write_md_to_pdf(report, filename) 56 | # Saving report as docx 57 | docx_path = await write_md_to_word(report, filename) 58 | # Returning the path of saved report files 59 | md_path = await write_text_to_md(report, filename) 60 | await websocket.send_json({"type": "path", "output": {"pdf": pdf_path, "docx": docx_path, "md": md_path}}) 61 | else: 62 | print("Error: not enough parameters provided.") 63 | 64 | except WebSocketDisconnect: 65 | await manager.disconnect(websocket) 66 | 67 | -------------------------------------------------------------------------------- /backend/utils.py: -------------------------------------------------------------------------------- 1 | import aiofiles 2 | import urllib 3 | import mistune 4 | 5 | 6 | async def write_to_file(filename: str, text: str) -> None: 7 | """Asynchronously write text to a file in UTF-8 encoding. 8 | 9 | Args: 10 | filename (str): The filename to write to. 11 | text (str): The text to write. 
12 | """ 13 | # Convert text to UTF-8, replacing any problematic characters 14 | text_utf8 = text.encode('utf-8', errors='replace').decode('utf-8') 15 | 16 | async with aiofiles.open(filename, "w", encoding='utf-8') as file: 17 | await file.write(text_utf8) 18 | 19 | 20 | async def write_text_to_md(text: str, filename: str = "") -> str: 21 | """Writes text to a Markdown file and returns the file path. 22 | 23 | Args: 24 | text (str): Text to write to the Markdown file. 25 | 26 | Returns: 27 | str: The file path of the generated Markdown file. 28 | """ 29 | file_path = f"outputs/{filename[:60]}.md" 30 | await write_to_file(file_path, text) 31 | return file_path 32 | 33 | 34 | async def write_md_to_pdf(text: str, filename: str = "") -> str: 35 | """Converts Markdown text to a PDF file and returns the file path. 36 | 37 | Args: 38 | text (str): Markdown text to convert. 39 | 40 | Returns: 41 | str: The encoded file path of the generated PDF. 42 | """ 43 | file_path = f"outputs/{filename[:60]}.pdf" 44 | 45 | try: 46 | from md2pdf.core import md2pdf 47 | md2pdf(file_path, 48 | md_content=text, 49 | #md_file_path=f"{file_path}.md", 50 | css_file_path="./frontend/pdf_styles.css", 51 | base_url=None) 52 | print(f"Report written to {file_path}.pdf") 53 | except Exception as e: 54 | print(f"Error in converting Markdown to PDF: {e}") 55 | return "" 56 | 57 | encoded_file_path = urllib.parse.quote(file_path) 58 | return encoded_file_path 59 | 60 | 61 | async def write_md_to_word(text: str, filename: str = "") -> str: 62 | """Converts Markdown text to a DOCX file and returns the file path. 63 | 64 | Args: 65 | text (str): Markdown text to convert. 66 | 67 | Returns: 68 | str: The encoded file path of the generated DOCX. 69 | """ 70 | file_path = f"outputs/{filename[:60]}.docx" 71 | 72 | try: 73 | from docx import Document 74 | from htmldocx import HtmlToDocx 75 | # Convert report markdown to HTML 76 | html = mistune.html(text) 77 | # Create a document object 78 | doc = Document() 79 | # Convert the html generated from the report to document format 80 | HtmlToDocx().add_html_to_document(html, doc) 81 | 82 | # Saving the docx document to file_path 83 | doc.save(file_path) 84 | 85 | print(f"Report written to {file_path}") 86 | 87 | encoded_file_path = urllib.parse.quote(file_path) 88 | return encoded_file_path 89 | 90 | except Exception as e: 91 | print(f"Error in converting Markdown to DOCX: {e}") 92 | return "" 93 | -------------------------------------------------------------------------------- /backend/websocket_manager.py: -------------------------------------------------------------------------------- 1 | # connect any client to gpt-researcher using websocket 2 | import asyncio 3 | import datetime 4 | from typing import Dict, List 5 | 6 | from fastapi import WebSocket 7 | 8 | from backend.report_type import BasicReport, DetailedReport 9 | 10 | from gpt_researcher.utils.enum import ReportType 11 | 12 | 13 | class WebSocketManager: 14 | """Manage websockets""" 15 | 16 | def __init__(self): 17 | """Initialize the WebSocketManager class.""" 18 | self.active_connections: List[WebSocket] = [] 19 | self.sender_tasks: Dict[WebSocket, asyncio.Task] = {} 20 | self.message_queues: Dict[WebSocket, asyncio.Queue] = {} 21 | 22 | async def start_sender(self, websocket: WebSocket): 23 | """Start the sender task.""" 24 | queue = self.message_queues.get(websocket) 25 | if not queue: 26 | return 27 | 28 | while True: 29 | message = await queue.get() 30 | if websocket in self.active_connections: 31 | try: 32 | await 
websocket.send_text(message) 33 | except: 34 | break 35 | else: 36 | break 37 | 38 | async def connect(self, websocket: WebSocket): 39 | """Connect a websocket.""" 40 | await websocket.accept() 41 | self.active_connections.append(websocket) 42 | self.message_queues[websocket] = asyncio.Queue() 43 | self.sender_tasks[websocket] = asyncio.create_task( 44 | self.start_sender(websocket)) 45 | 46 | async def disconnect(self, websocket: WebSocket): 47 | """Disconnect a websocket.""" 48 | if websocket in self.active_connections: 49 | self.active_connections.remove(websocket) 50 | self.sender_tasks[websocket].cancel() 51 | await self.message_queues[websocket].put(None) 52 | del self.sender_tasks[websocket] 53 | del self.message_queues[websocket] 54 | 55 | async def start_streaming(self, task, report_type, report_source, websocket): 56 | """Start streaming the output.""" 57 | report = await run_agent(task, report_type, report_source, websocket) 58 | return report 59 | 60 | 61 | async def run_agent(task, report_type, report_source, websocket): 62 | """Run the agent.""" 63 | # measure time 64 | start_time = datetime.datetime.now() 65 | # add customized JSON config file path here 66 | config_path = "" 67 | # Instead of running the agent directly run it through the different report type classes 68 | if report_type == ReportType.DetailedReport.value: 69 | researcher = DetailedReport(query=task, report_type=report_type, report_source=report_source, 70 | source_urls=None, config_path=config_path, websocket=websocket) 71 | else: 72 | researcher = BasicReport(query=task, report_type=report_type, report_source=report_source, 73 | source_urls=None, config_path=config_path, websocket=websocket) 74 | 75 | report = await researcher.run() 76 | # measure time 77 | end_time = datetime.datetime.now() 78 | await websocket.send_json({"type": "logs", "output": f"\nTotal run time: {end_time - start_time}\n"}) 79 | 80 | return report 81 | -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides a command line interface for the GPTResearcher class. 
3 | 4 | Usage: 5 | 6 | ```shell 7 | python cli.py "<query>" --report_type <report_type> 8 | ``` 9 | 10 | """ 11 | import asyncio 12 | import argparse 13 | from argparse import RawTextHelpFormatter 14 | from uuid import uuid4 15 | 16 | from dotenv import load_dotenv 17 | 18 | from gpt_researcher import GPTResearcher 19 | from gpt_researcher.utils.enum import ReportType 20 | 21 | # ============================================================================= 22 | # CLI 23 | # ============================================================================= 24 | 25 | cli = argparse.ArgumentParser( 26 | description="Generate a research report.", 27 | # Enables the use of newlines in the help message 28 | formatter_class=RawTextHelpFormatter) 29 | 30 | # ===================================== 31 | # Arg: Query 32 | # ===================================== 33 | 34 | cli.add_argument( 35 | # Position 0 argument 36 | "query", 37 | type=str, 38 | help="The query to conduct research on.") 39 | 40 | # ===================================== 41 | # Arg: Report Type 42 | # ===================================== 43 | 44 | choices = [report_type.value for report_type in ReportType] 45 | 46 | report_type_descriptions = { 47 | ReportType.ResearchReport.value: "Summary - Short and fast (~2 min)", 48 | ReportType.DetailedReport.value: "Detailed - In depth and longer (~5 min)", 49 | ReportType.ResourceReport.value: "", 50 | ReportType.OutlineReport.value: "", 51 | ReportType.CustomReport.value: "", 52 | ReportType.SubtopicReport.value: "" 53 | } 54 | 55 | cli.add_argument( 56 | "--report_type", 57 | type=str, 58 | help="The type of report to generate. Options:\n" + "\n".join( 59 | f" {choice}: {report_type_descriptions[choice]}" for choice in choices 60 | ), 61 | # Deserialize ReportType as a List of strings: 62 | choices=choices, 63 | required=True) 64 | 65 | # ============================================================================= 66 | # Main 67 | # ============================================================================= 68 | 69 | 70 | async def main(args): 71 | """ 72 | Conduct research on the given query, generate the report, and write 73 | it as a markdown file to the output directory. 
74 | """ 75 | researcher = GPTResearcher( 76 | query=args.query, 77 | report_type=args.report_type) 78 | 79 | await researcher.conduct_research() 80 | 81 | report = await researcher.write_report() 82 | 83 | # Write the report to a file 84 | artifact_filepath = f"outputs/{uuid4()}.md" 85 | with open(artifact_filepath, "w") as f: 86 | f.write(report) 87 | 88 | print(f"Report written to '{artifact_filepath}'") 89 | 90 | if __name__ == "__main__": 91 | load_dotenv() 92 | args = cli.parse_args() 93 | asyncio.run(main(args)) 94 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | gpt-researcher: 4 | image: kramer1346/gpt-researcher 5 | build: ./ 6 | environment: 7 | OPENAI_API_KEY: ${OPENAI_API_KEY} 8 | TAVILY_API_KEY: ${TAVILY_API_KEY} 9 | LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} 10 | ports: 11 | - 8000:8000 -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | docs.gptr.dev -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. 4 | 5 | ## Prerequisites 6 | 7 | To build and test documentation locally, begin by downloading and installing [Node.js](https://nodejs.org/en/download/), and then installing [Yarn](https://classic.yarnpkg.com/en/). 8 | On Windows, you can install via the npm package manager (npm) which comes bundled with Node.js: 9 | 10 | ```console 11 | npm install --global yarn 12 | ``` 13 | 14 | ## Installation 15 | 16 | ```console 17 | pip install pydoc-markdown 18 | cd website 19 | yarn install 20 | ``` 21 | 22 | ## Local Development 23 | 24 | Navigate to the website folder and run: 25 | 26 | ```console 27 | pydoc-markdown 28 | yarn start 29 | ``` 30 | 31 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. 
32 | -------------------------------------------------------------------------------- /docs/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /docs/blog/2023-09-22-gpt-researcher/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2023-09-22-gpt-researcher/architecture.png -------------------------------------------------------------------------------- /docs/blog/2023-09-22-gpt-researcher/planner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2023-09-22-gpt-researcher/planner.jpeg -------------------------------------------------------------------------------- /docs/blog/2023-11-12-openai-assistant/diagram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2023-11-12-openai-assistant/diagram-1.png -------------------------------------------------------------------------------- /docs/blog/2023-11-12-openai-assistant/diagram-assistant.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2023-11-12-openai-assistant/diagram-assistant.jpeg -------------------------------------------------------------------------------- /docs/blog/2024-05-19-gptr-langgraph/architecture.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2024-05-19-gptr-langgraph/architecture.jpeg -------------------------------------------------------------------------------- /docs/blog/2024-05-19-gptr-langgraph/blog-langgraph.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/blog/2024-05-19-gptr-langgraph/blog-langgraph.jpeg -------------------------------------------------------------------------------- /docs/blog/authors.yml: -------------------------------------------------------------------------------- 1 | assafe: 2 | name: Assaf Elovic 3 | title: Creator @ GPT Researcher 4 | url: https://github.com/assafelovic 5 | image_url: https://lh3.googleusercontent.com/a/ACg8ocJtrLku69VG_2Y0sJa5mt66gIGNaEBX5r_mgE6CRPEb7A=s96-c 6 | -------------------------------------------------------------------------------- /docs/docs/contribute.md: -------------------------------------------------------------------------------- 1 | # Contribute 2 | 3 | We highly welcome contributions! Please check out [contributing](https://github.com/assafelovic/gpt-researcher/blob/master/CONTRIBUTING.md) if you're interested. 4 | 5 | Please check out our [roadmap](https://trello.com/b/3O7KBePw/gpt-researcher-roadmap) page and reach out to us via our [Discord community](https://discord.gg/2pFkc83fRq) if you're interested in joining our mission. 
-------------------------------------------------------------------------------- /docs/docs/examples/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ### Run PIP Package 4 | ```python 5 | from gpt_researcher import GPTResearcher 6 | import asyncio 7 | 8 | 9 | async def main(): 10 | """ 11 | This is a sample script that shows how to run a research report. 12 | """ 13 | # Query 14 | query = "What happened in the latest burning man floods?" 15 | 16 | # Report Type 17 | report_type = "research_report" 18 | 19 | # Initialize the researcher 20 | researcher = GPTResearcher(query=query, report_type=report_type, config_path=None) 21 | # Conduct research on the given query 22 | await researcher.conduct_research() 23 | # Write the report 24 | report = await researcher.write_report() 25 | 26 | return report 27 | 28 | 29 | if __name__ == "__main__": 30 | asyncio.run(main()) 31 | ``` -------------------------------------------------------------------------------- /docs/docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ### How do I get started? 4 | It really depends on what you're aiming for. 5 | 6 | If you're looking to connect your AI application to the internet with Tavily tailored API, check out the [Tavily API](https://docs.tavily.com/docs/tavily-api/introductionn) documentation. 7 | If you're looking to build and deploy our open source autonomous research agent GPT Researcher, please see [GPT Researcher](/docs/gpt-researcher/introduction) documentation. 8 | You can also check out demos and examples for inspiration [here](/docs/examples/examples). 9 | ### What is GPT Researcher? 10 | GPT Researcher is a popular open source autonomous research agent that takes care of the tedious task of research for you, by scraping, filtering and aggregating over 20+ web sources per a single research task. 11 | 12 | GPT Researcher is built with best practices for leveraging LLMs (prompt engineering, RAG, chains, embeddings, etc), and is optimized for quick and efficient research. It is also fully customizable and can be tailored to your specific needs. 13 | 14 | To learn more about GPT Researcher, check out the [documentation page](/docs/gpt-researcher/introduction). 15 | ### How much does each research run cost? 16 | A research task using GPT Researcher costs around $0.01 per a single run (for GPT-4 usage). We're constantly optimizing LLM calls to reduce costs and improve performance. 17 | ### How do you ensure the report is factual and accurate? 18 | we do our best to ensure that the information we provide is factual and accurate. We do this by using multiple sources, and by using proprietary AI to score and rank the most relevant and accurate information. We also use proprietary AI to filter out irrelevant information and sources. 19 | 20 | Lastly, by using RAG and other techniques, we ensure that the information is relevant to the context of the research task, leading to more accurate generative AI content and reduced hallucinations. 21 | 22 | ### What are your plans for the future? 23 | We're constantly working on improving our products and services. We're currently working on improving our search API together with design partners, and adding more data sources to our search engine. We're also working on improving our research agent GPT Researcher, and adding more features to it while growing our amazing open source community. 
 24 | 25 | If you're interested in our roadmap or looking to collaborate, check out our [roadmap page](https://trello.com/b/3O7KBePw/gpt-researcher-roadmap). 26 | 27 | Feel free to [contact us](mailto:assafelovic@gmail.com) if you have any further questions or suggestions! -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/config.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The config.py file enables you to customize GPT Researcher to your specific needs and preferences. 4 | 5 | Thanks to our amazing community and contributions, GPT Researcher supports multiple LLMs and Retrievers. 6 | In addition, GPT Researcher can be tailored to various report formats (such as APA), word count, research iteration depth, etc. 7 | 8 | GPT Researcher defaults to our recommended suite of integrations: [OpenAI](https://platform.openai.com/docs/overview) for LLM calls and [Tavily API](https://app.tavily.com) for retrieving realtime online information. 9 | 10 | As seen below, OpenAI still stands as the superior LLM. We assume it will stay this way for some time, and that prices will only continue to decrease, while performance and speed increase over time. 11 | 12 |
13 | 14 |
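Every setting shown in the config below is read from an environment variable, so you can override any default by adding a line to your `.env` file. For example, a few illustrative overrides (the values here are examples only):

```bash
RETRIEVER=duckduckgo
REPORT_FORMAT=APA
TOTAL_WORDS=1200
MAX_ITERATIONS=4
```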
15 | 16 | Here is an example of the default config.py file found in `/gpt_researcher/config/`: 17 | 18 | ```python 19 | import os 20 | def __init__(self, config_file: str = None): 21 | """Initialize the config class.""" 22 | self.config_file = os.path.expanduser(config_file) if config_file else os.getenv('CONFIG_FILE') 23 | self.retriever = os.getenv('RETRIEVER', "tavily") 24 | self.embedding_provider = os.getenv('EMBEDDING_PROVIDER', 'openai') 25 | self.llm_provider = os.getenv('LLM_PROVIDER', "openai") 26 | self.fast_llm_model = os.getenv('FAST_LLM_MODEL', "gpt-3.5-turbo-16k") 27 | self.smart_llm_model = os.getenv('SMART_LLM_MODEL', "gpt-4o") 28 | self.fast_token_limit = int(os.getenv('FAST_TOKEN_LIMIT', 2000)) 29 | self.smart_token_limit = int(os.getenv('SMART_TOKEN_LIMIT', 4000)) 30 | self.browse_chunk_max_length = int(os.getenv('BROWSE_CHUNK_MAX_LENGTH', 8192)) 31 | self.summary_token_limit = int(os.getenv('SUMMARY_TOKEN_LIMIT', 700)) 32 | self.temperature = float(os.getenv('TEMPERATURE', 0.55)) 33 | self.user_agent = os.getenv('USER_AGENT', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " 34 | "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0") 35 | self.max_search_results_per_query = int(os.getenv('MAX_SEARCH_RESULTS_PER_QUERY', 5)) 36 | self.memory_backend = os.getenv('MEMORY_BACKEND', "local") 37 | self.total_words = int(os.getenv('TOTAL_WORDS', 800)) 38 | self.report_format = os.getenv('REPORT_FORMAT', "APA") 39 | self.max_iterations = int(os.getenv('MAX_ITERATIONS', 3)) 40 | self.agent_role = os.getenv('AGENT_ROLE', None) 41 | self.scraper = os.getenv("SCRAPER", "bs") 42 | self.max_subtopics = os.getenv("MAX_SUBTOPICS", 3) 43 | self.doc_path = os.getenv("DOC_PATH", "") 44 | ``` 45 | To change the default configurations, you can simply add env variables to your `.env` file as named in the config.py file. 46 | 47 | Please note that you can also include your own external JSON file `config.json` by adding the path in the `config_file` param. 48 | 49 | To learn more about additional LLM support you can check out the docs [here](/docs/gpt-researcher/llms). 50 | 51 | You can also change the search engine by modifying the `retriever` param to others such as `duckduckgo`, `bing`, `google`, `serper`, `searx` and more. [Check here](https://github.com/assafelovic/gpt-researcher/tree/master/gpt_researcher/retrievers) for supported retrievers. 52 | 53 | Please note that you might need to sign up and obtain an API key for any of the other supported retrievers and LLM providers. -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/example.md: -------------------------------------------------------------------------------- 1 | # Agent Example 2 | 3 | If you're interested in using GPT Researcher as a standalone agent, you can easily import it into any existing Python project. Below, is an example of calling the agent to generate a research report: 4 | 5 | ```python 6 | from gpt_researcher import GPTResearcher 7 | import asyncio 8 | 9 | # It is best to define global constants at the top of your script 10 | QUERY = "What happened in the latest burning man floods?" 11 | REPORT_TYPE = "research_report" 12 | 13 | async def fetch_report(query, report_type): 14 | """ 15 | Fetch a research report based on the provided query and report type. 
16 | """ 17 | researcher = GPTResearcher(query=query, report_type=report_type, config_path=None) 18 | await researcher.conduct_research() 19 | report = await researcher.write_report() 20 | return report 21 | 22 | async def generate_research_report(): 23 | """ 24 | This is a sample script that executes an async main function to run a research report. 25 | """ 26 | report = await fetch_report(QUERY, REPORT_TYPE) 27 | print(report) 28 | 29 | if __name__ == "__main__": 30 | asyncio.run(generate_research_report()) 31 | ``` 32 | 33 | You can further enhance this example to use the returned report as context for generating valuable content such as news article, marketing content, email templates, newsletters, etc. 34 | 35 | You can also use GPT Researcher to gather information about code documentation, business analysis, financial information and more. All of which can be used to complete much more complex tasks that require factual and high quality realtime information. 36 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | > **Step 0** - Install Python 3.11 or later. [See here](https://www.tutorialsteacher.com/python/install-python) for a step-by-step guide. 3 | 4 | > **Step 1** - Download the project and navigate to its directory 5 | 6 | ```bash 7 | $ git clone https://github.com/assafelovic/gpt-researcher.git 8 | $ cd gpt-researcher 9 | ``` 10 | 11 | > **Step 3** - Set up API keys using two methods: exporting them directly or storing them in a `.env` file. 12 | 13 | For Linux/Temporary Windows Setup, use the export method: 14 | 15 | ```bash 16 | export OPENAI_API_KEY={Your OpenAI API Key here} 17 | export TAVILY_API_KEY={Your Tavily API Key here} 18 | ``` 19 | 20 | For a more permanent setup, create a `.env` file in the current `gpt-researcher` folder and input the keys as follows: 21 | 22 | ```bash 23 | OPENAI_API_KEY={Your OpenAI API Key here} 24 | TAVILY_API_KEY={Your Tavily API Key here} 25 | ``` 26 | 27 | - **For LLM, we recommend [OpenAI GPT](https://platform.openai.com/docs/guides/gpt)**, but you can use any other LLM model (including open sources) supported by [Langchain Adapter](https://python.langchain.com/docs/guides/adapters/openai), simply change the llm model and provider in config/config.py. 28 | - **For search engine, we recommend [Tavily Search API](https://app.tavily.com)**, but you can also refer to other search engines of your choice by changing the search provider in config/config.py to `"duckduckgo"`, `"googleAPI"`, `"bing"`, `"googleSerp"`, or `"searx"`. Then add the corresponding env API key as seen in the config.py file. 29 | 30 | ## Quickstart 31 | 32 | > **Step 1** - Install dependencies 33 | 34 | ```bash 35 | $ pip install -r requirements.txt 36 | ``` 37 | 38 | > **Step 2** - Run the agent with FastAPI 39 | 40 | ```bash 41 | $ uvicorn main:app --reload 42 | ``` 43 | 44 | > **Step 3** - Go to http://localhost:8000 on any browser and enjoy researching! 45 | 46 | ## Using Virtual Environment or Poetry 47 | Select either based on your familiarity with each: 48 | 49 | ### Virtual Environment 50 | 51 | #### *Establishing the Virtual Environment with Activate/Deactivate configuration* 52 | 53 | Create a virtual environment using the `venv` package with the environment name ``, for example, `env`. 
Execute the following command in the PowerShell/CMD terminal: 54 | 55 | ```bash 56 | python -m venv env 57 | ``` 58 | 59 | To activate the virtual environment, use the following activation script in PowerShell/CMD terminal: 60 | 61 | ```bash 62 | .\env\Scripts\activate 63 | ``` 64 | 65 | To deactivate the virtual environment, run the following deactivation script in PowerShell/CMD terminal: 66 | 67 | ```bash 68 | deactivate 69 | ``` 70 | 71 | #### *Install the dependencies for a Virtual environment* 72 | 73 | After activating the `env` environment, install dependencies using the `requirements.txt` file with the following command: 74 | 75 | ```bash 76 | python -m pip install -r requirements.txt 77 | ``` 78 | 79 |
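The activation commands above target PowerShell/CMD on Windows; on macOS or Linux, the equivalent commands to create and activate the same environment are:

```bash
python3 -m venv env
source env/bin/activate
```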
80 | 81 | ### Poetry 82 | 83 | #### *Establishing the Poetry dependencies and virtual environment with Poetry version `~1.7.1`* 84 | 85 | Install project dependencies and simultaneously create a virtual environment for the specified project. By executing this command, Poetry reads the project's "pyproject.toml" file to determine the required dependencies and their versions, ensuring a consistent and isolated development environment. The virtual environment allows for a clean separation of project-specific dependencies, preventing conflicts with system-wide packages and enabling more straightforward dependency management throughout the project's lifecycle. 86 | 87 | ```bash 88 | poetry install 89 | ``` 90 | 91 | #### *Activate the virtual environment associated with a Poetry project* 92 | 93 | By running this command, the user enters a shell session within the isolated environment associated with the project, providing a dedicated space for development and execution. This virtual environment ensures that the project dependencies are encapsulated, avoiding conflicts with system-wide packages. Activating the Poetry shell is essential for seamlessly working on a project, as it ensures that the correct versions of dependencies are used and provides a controlled environment conducive to efficient development and testing. 94 | 95 | ```bash 96 | poetry shell 97 | ``` 98 | 99 | ### *Run the app* 100 | > Launch the FastAPI application agent on a *Virtual Environment or Poetry* setup by executing the following command: 101 | ```bash 102 | python -m uvicorn main:app --reload 103 | ``` 104 | > Visit http://localhost:8000 in any web browser and explore your research! 105 | 106 |
 107 | 108 | 109 | ## Try it with Docker 110 | 111 | > **Step 1** - Install Docker 112 | 113 | Follow instructions at https://docs.docker.com/engine/install/ 114 | 115 | > **Step 2** - Create a .env file with your OpenAI key, or simply export it 116 | 117 | ```bash 118 | $ export OPENAI_API_KEY={Your API Key here} 119 | $ export TAVILY_API_KEY={Your Tavily API Key here} 120 | ``` 121 | 122 | > **Step 3** - Run the application 123 | 124 | ```bash 125 | $ docker-compose up 126 | ``` 127 | 128 | > **Step 4** - Go to http://localhost:8000 on any browser and enjoy researching! 129 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | **[GPT Researcher](https://gptr.dev) is an autonomous agent designed for comprehensive online research on a variety of tasks.** 4 | 5 | The agent can produce detailed, factual and unbiased research reports, with customization options for focusing on relevant resources, outlines, and lessons. Inspired by the recent [Plan-and-Solve](https://arxiv.org/abs/2305.04091) and [RAG](https://arxiv.org/abs/2005.11401) papers, GPT Researcher addresses issues of speed, determinism and reliability, offering more stable performance and increased speed through parallelized agent work, as opposed to synchronous operations. 6 | 7 | ## Why GPT Researcher? 8 | 9 | - Forming objective conclusions through manual research can take time, sometimes weeks, to find the right resources and information. 10 | - Current LLMs are trained on past and outdated information, with heavy risks of hallucinations, making them almost irrelevant for research tasks. 11 | - Solutions that enable web search (such as ChatGPT + Web Plugin) only consider limited resources and content that in some cases results in superficial conclusions or biased answers. 12 | - Using only a selection of resources can create bias in determining the right conclusions for research questions or tasks. 13 | 14 | ## Architecture 15 | The main idea is to run "planner" and "execution" agents, where the planner generates questions to research, and the execution agents seek the most related information based on each generated research question. Finally, the planner filters and aggregates all related information and creates a research report.

 16 | The agents leverage both gpt-3.5-turbo and gpt-4-turbo (128K context) to complete a research task. We optimize for costs by using each only when necessary. **The average research task takes around 3 minutes to complete, and costs ~$0.1.** 17 | 18 |
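To make that flow concrete, here is a minimal, illustrative sketch of the planner/execution loop in Python. The function names and bodies below are placeholders for explanation only; they are not the project's actual API:

```python
import asyncio

# Illustrative placeholders only; these are NOT the project's real functions.
async def plan_questions(task: str) -> list[str]:
    # Planner: an LLM call would generate the research questions for the task.
    return [f"{task} (question {i})" for i in range(1, 4)]

async def search_and_summarize(question: str) -> str:
    # Execution agent: search the web for the question and summarize the sources found.
    return f"summary of findings for: {question}"

async def write_report(task: str, findings: list[str]) -> str:
    # Planner: filter and aggregate all findings into a single report.
    return f"# Report on {task}\n\n" + "\n\n".join(findings)

async def run_research(task: str) -> str:
    questions = await plan_questions(task)
    # Execution agents run in parallel, one per generated research question.
    findings = await asyncio.gather(*(search_and_summarize(q) for q in questions))
    return await write_report(task, findings)

# Example: print(asyncio.run(run_research("Is AI in a hype cycle?")))
```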
19 | 20 |
 21 | 22 | 23 | More specifically: 24 | * Create a domain-specific agent based on the research query or task. 25 | * Generate a set of research questions that together form an objective opinion on any given task. 26 | * For each research question, trigger a crawler agent that scrapes online resources for information relevant to the given task. 27 | * For each scraped resource, summarize the relevant information and keep track of its sources. 28 | * Finally, filter and aggregate all summarized sources and generate a final research report. 29 | 30 | ## Demo 31 | 32 | 33 | ## Tutorials 34 | - [How it Works](https://medium.com/better-programming/how-i-built-an-autonomous-ai-agent-for-online-research-93435a97c6c) 35 | - [How to Install](https://www.loom.com/share/04ebffb6ed2a4520a27c3e3addcdde20?sid=da1848e8-b1f1-42d1-93c3-5b0b9c3b24ea) 36 | - [Live Demo](https://www.loom.com/share/6a3385db4e8747a1913dd85a7834846f?sid=a740fd5b-2aa3-457e-8fb7-86976f59f9b8) 37 | - [Homepage](https://gptr.dev) 38 | 39 | ## Features 40 | - 📝 Generate research, outline, resource and lesson reports 41 | - 📜 Can generate long and detailed research reports (over 2K words) 42 | - 🌐 Aggregates over 20 web sources per research task to form objective and factual conclusions 43 | - 🖥️ Includes an easy-to-use web interface (HTML/CSS/JS) 44 | - 🔍 Scrapes web sources with JavaScript support 45 | - 📂 Keeps track and context of visited and used web sources 46 | - 📄 Export research reports to PDF, Word and more... 47 | 48 | 49 | ## Disclaimer 50 | 51 | This project, GPT Researcher, is an experimental application and is provided "as-is" without any warranty, express or implied. We are sharing the code for academic purposes under the MIT license. Nothing herein is academic advice, and it is NOT a recommendation for use in academic or research papers. 52 | 53 | Our view on unbiased research claims: 54 | 1. The whole point of our scraping system is to reduce incorrect facts. How? The more sites we scrape, the lower the chance of incorrect data. We scrape 20 sources per research task, so the chance that they are all wrong is extremely low. 55 | 2. We do not aim to eliminate biases; we aim to reduce them as much as possible. **We are here as a community to figure out the most effective human/llm interactions.** 56 | 3. In research, people also tend towards biases, as most already have opinions on the topics they research. This tool scrapes many opinions and will evenly explain diverse views that a biased person would never have read. 57 | 58 | **Please note that the use of the GPT-4 language model can be expensive due to its token usage.** By utilizing this project, you acknowledge that you are responsible for monitoring and managing your own token usage and the associated costs. It is highly recommended to check your OpenAI API usage regularly and set up any necessary limits or alerts to prevent unexpected charges. 59 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/langgraph.md: -------------------------------------------------------------------------------- 1 | # GPTR x LangGraph 2 | 3 | We are strong advocates for the future of AI agents, envisioning a world where autonomous agents communicate and collaborate as a cohesive team to undertake and complete complex tasks. 4 | 5 | We hold the belief that research is a pivotal element in successfully tackling these complex tasks, ensuring superior outcomes. 
6 | 7 | Consider the scenario of developing a coding agent responsible for coding tasks using the latest API documentation and best practices. It would be wise to integrate an agent specializing in research to curate the most recent and relevant documentation, before crafting a technical design that would subsequently be handed off to the coding assistant tasked with generating the code. This approach is applicable across various sectors, including finance, business analysis, healthcare, marketing, and legal, among others. 8 | 9 | One multi-agent framework that we're excited about is [LangGraph](https://python.langchain.com/docs/langgraph/), built by the team at [Langchain](https://www.langchain.com/). 10 | LangGraph is a Python library for building stateful, multi-actor applications with LLMs. It extends the [LangChain Expression Language](https://python.langchain.com/docs/expression_language/) with the ability to coordinate multiple chains (or actors) across multiple steps of computation. 11 | 12 | What's great about LangGraph is that it follows a DAG architecture, enabling each specialized agent to communicate with one another, and subsequently trigger actions among other agents within the graph. 13 | 14 | We've added an example for leveraging [GPT Researcher with LangGraph](https://github.com/assafelovic/gpt-researcher/tree/master/multi_agents) which can be found in `/multi_agents`. 15 | 16 | The example demonstrates a generic use case for an editorial agent team that works together to complete a research report on a given task. 17 | 18 | ## The Multi Agent Team 19 | The research team is made up of 7 AI agents: 20 | - **Chief Editor** - Oversees the research process and manages the team. This is the "master" agent that coordinates the other agents using Langgraph. 21 | - **Researcher** (gpt-researcher) - A specialized autonomous agent that conducts in depth research on a given topic. 22 | - **Editor** - Responsible for planning the research outline and structure. 23 | - **Reviewer** - Validates the correctness of the research results given a set of criteria. 24 | - **Revisor** - Revises the research results based on the feedback from the reviewer. 25 | - **Writer** - Responsible for compiling and writing the final report. 26 | - **Publisher** - Responsible for publishing the final report in various formats. 27 | 28 | ## How it works 29 | Generally, the process is based on the following stages: 30 | 1. Planning stage 31 | 2. Data collection and analysis 32 | 3. Writing and submission 33 | 4. Review and revision 34 | 5. Publication 35 | 36 | ### Architecture 37 |
38 | 39 |
40 |
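The architecture above maps onto a fairly compact LangGraph program. The sketch below is illustrative only: the node names, stub functions and state schema are hypothetical simplifications, and the real graph lives in `/multi_agents`. It shows the general shape of wiring such a team together with `StateGraph`.

```python
from typing import TypedDict, List
from langgraph.graph import StateGraph, END

# Simplified shared state (hypothetical; the real schema differs).
class ResearchState(TypedDict, total=False):
    task: str
    initial_research: str
    outline: List[str]
    report: str

# Stub node functions standing in for the real agents.
def browser(state: ResearchState) -> dict:
    return {"initial_research": f"initial findings for: {state['task']}"}

def editor(state: ResearchState) -> dict:
    return {"outline": ["introduction", "analysis", "conclusion"]}

def writer(state: ResearchState) -> dict:
    return {"report": f"report on {state['task']} covering {state['outline']}"}

def publisher(state: ResearchState) -> dict:
    print("publishing:", state["report"])
    return {}

workflow = StateGraph(ResearchState)
workflow.add_node("browser", browser)      # initial research (gpt-researcher)
workflow.add_node("editor", editor)        # plans the outline
workflow.add_node("writer", writer)        # compiles the final report
workflow.add_node("publisher", publisher)  # exports to the requested formats

workflow.set_entry_point("browser")
workflow.add_edge("browser", "editor")
workflow.add_edge("editor", "writer")
workflow.add_edge("writer", "publisher")
workflow.add_edge("publisher", END)

app = workflow.compile()
app.invoke({"task": "Is AI in a hype cycle?"})
```

The actual graph in `/multi_agents` builds on this backbone by adding the reviewer and reviser loop and by running the per-topic research in parallel, as described in the steps below.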
41 | 42 | ### Steps 43 | More specifically (as seen in the architecture diagram) the process is as follows: 44 | - Browser (gpt-researcher) - Browses the internet for initial research based on the given research task. 45 | - Editor - Plans the report outline and structure based on the initial research. 46 | - For each outline topic (in parallel): 47 | - Researcher (gpt-researcher) - Runs an in depth research on the subtopics and writes a draft. 48 | - Reviewer - Validates the correctness of the draft given a set of criteria and provides feedback. 49 | - Revisor - Revises the draft until it is satisfactory based on the reviewer feedback. 50 | - Writer - Compiles and writes the final report including an introduction, conclusion and references section from the given research findings. 51 | - Publisher - Publishes the final report to multi formats such as PDF, Docx, Markdown, etc. 52 | 53 | ## How to run 54 | 1. Install required packages: 55 | ```bash 56 | pip install -r requirements.txt 57 | ``` 58 | 2. Run the application: 59 | ```bash 60 | python main.py 61 | ``` 62 | 63 | ## Usage 64 | To change the research query and customize the report, edit the `task.json` file in the main directory. 65 | #### Task.json contains the following fields: 66 | - `query` - The research query or task. 67 | - `model` - The OpenAI LLM to use for the agents. 68 | - `max_sections` - The maximum number of sections in the report. Each section is a subtopic of the research query. 69 | - `publish_formats` - The formats to publish the report in. The reports will be written in the `output` directory. 70 | - `follow_guidelines` - If true, the research report will follow the guidelines below. It will take longer to complete. If false, the report will be generated faster but may not follow the guidelines. 71 | - `guidelines` - A list of guidelines that the report must follow. 72 | - `verbose` - If true, the application will print detailed logs to the console. 73 | 74 | #### For example: 75 | ```json 76 | { 77 | "query": "Is AI in a hype cycle?", 78 | "model": "gpt-4o", 79 | "max_sections": 3, 80 | "publish_formats": { 81 | "markdown": true, 82 | "pdf": true, 83 | "docx": true 84 | }, 85 | "follow_guidelines": true, 86 | "guidelines": [ 87 | "The report MUST fully answer the original question", 88 | "The report MUST be written in apa format", 89 | "The report MUST be written in english" 90 | ], 91 | "verbose": true 92 | } 93 | ``` 94 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/llms.md: -------------------------------------------------------------------------------- 1 | # Configure LLM 2 | As described in the [introduction](/docs/gpt-researcher/config), the default LLM is OpenAI due to its superior performance and speed. 3 | However, GPT Researcher supports various open/closed source LLMs, and you can easily switch between them by adding the `LLM_PROVIDER` env variable and corresponding configuration params. 4 | 5 | Below you can find how to configure the various supported LLMs. 6 | 7 | ## OpenAI 8 | 9 | ## Ollama 10 | 11 | ## Groq 12 | 13 | GroqCloud provides advanced AI hardware and software solutions designed to deliver amazingly fast AI inference performance. 14 | To leverage Groq in GPT-Researcher, you will need a GroqCloud account and an API Key. (__NOTE:__ Groq has a very _generous free tier_.) 
15 | 16 | - You can signup here: [https://console.groq.com/login](https://console.groq.com/login) 17 | - Once you are logged in, you can get an API Key here: [https://console.groq.com/keys](https://console.groq.com/keys) 18 | 19 | - Once you have an API key, you will need to add it to your `systems environment` using the variable name: 20 | `GROQ_API_KEY="*********************"` 21 | 22 | 23 | And finally, you will need to configure the GPT-Researcher Provider and Model variables: 24 | 25 | ```bash 26 | # To use Groq set the llm provider to groq 27 | LLM_PROVIDER=groq 28 | 29 | # Set one of the LLM models supported by Groq 30 | FAST_LLM_MODEL=Mixtral-8x7b-32768 31 | 32 | # Set one of the LLM models supported by Groq 33 | SMART_LLM_MODEL=Mixtral-8x7b-32768 34 | 35 | # The temperature to use defaults to 0.55 36 | TEMPERATURE=0.55 37 | ``` 38 | 39 | __NOTE:__ As of the writing of this Doc (May 2024), the available Language Models from Groq are: 40 | 41 | * Llama3-70b-8192 42 | * Llama3-8b-8192 43 | * Mixtral-8x7b-32768 44 | * Gemma-7b-it 45 | 46 | ## Anthropic 47 | 48 | ... 49 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/pip-package.md: -------------------------------------------------------------------------------- 1 | # PIP Package 2 | [![PyPI version](https://badge.fury.io/py/gpt-researcher.svg)](https://badge.fury.io/py/gpt-researcher) 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/assafelovic/gpt-researcher/blob/master/examples/pip-run.ipynb) 4 | 5 | 🌟 **Exciting News!** Now, you can integrate `gpt-researcher` with your apps seamlessly! 6 | 7 | ## Steps to Install GPT Researcher 🛠️ 8 | 9 | Follow these easy steps to get started: 10 | 11 | 0. **Pre-requisite**: Ensure Python 3.10+ is installed on your machine 💻 12 | 1. **Install gpt-researcher**: Grab the official package from [PyPi](https://pypi.org/project/gpt-researcher/). 13 | 14 | ```bash 15 | pip install gpt-researcher 16 | ``` 17 | 18 | 2. **Environment Variables:** Create a .env file with your OpenAI API key or simply export it 19 | 20 | ```bash 21 | export OPENAI_API_KEY={Your OpenAI API Key here} 22 | ``` 23 | 24 | ```bash 25 | export TAVILY_API_KEY={Your Tavily API Key here} 26 | ``` 27 | 28 | 3. **Start using GPT Researcher in your own codebase** 29 | 30 | ## Example Usage 📝 31 | 32 | ```python 33 | from gpt_researcher import GPTResearcher 34 | import asyncio 35 | 36 | 37 | from gpt_researcher import GPTResearcher 38 | import asyncio 39 | 40 | 41 | async def get_report(query: str, report_type: str) -> str: 42 | researcher = GPTResearcher(query, report_type) 43 | research_result = await researcher.conduct_research() 44 | report = await researcher.write_report() 45 | return report 46 | 47 | if __name__ == "__main__": 48 | query = "what team may win the NBA finals?" 
49 | report_type = "research_report" 50 | 51 | report = asyncio.run(get_report(query, report_type)) 52 | print(report) 53 | ``` 54 | 55 | ## Specific Examples 🌐 56 | 57 | ### Example 1: Research Report 📚 58 | 59 | ```python 60 | query = "Latest developments in renewable energy technologies" 61 | report_type = "research_report" 62 | ``` 63 | 64 | ### Example 2: Resource Report 📋 65 | 66 | ```python 67 | query = "List of top AI conferences in 2023" 68 | report_type = "resource_report" 69 | ``` 70 | 71 | ### Example 3: Outline Report 📝 72 | 73 | ```python 74 | query = "Outline for an article on the impact of AI in education" 75 | report_type = "outline_report" 76 | ``` 77 | 78 | ## Integration with Web Frameworks 🌍 79 | 80 | ### FastAPI Example 81 | 82 | ```python 83 | from fastapi import FastAPI 84 | from gpt_researcher import GPTResearcher 85 | import asyncio 86 | 87 | app = FastAPI() 88 | 89 | @app.get("/report/{report_type}") 90 | async def get_report(query: str, report_type: str) -> dict: 91 | researcher = GPTResearcher(query, report_type) 92 | research_result = await researcher.conduct_research() 93 | report = await researcher.write_report() 94 | return {"report": report} 95 | 96 | # Run the server 97 | # uvicorn main:app --reload 98 | ``` 99 | 100 | ### Flask Example 101 | 102 | **Pre-requisite**: Install flask with the async extra. 103 | 104 | ```bash 105 | pip install 'flask[async]' 106 | ``` 107 | 108 | ```python 109 | from flask import Flask, request 110 | from gpt_researcher import GPTResearcher 111 | 112 | app = Flask(__name__) 113 | 114 | @app.route('/report/', methods=['GET']) 115 | async def get_report(report_type): 116 | query = request.args.get('query') 117 | researcher = GPTResearcher(query, report_type) 118 | research_result = await researcher.conduct_research() 119 | report = await researcher.write_report() 120 | return report 121 | 122 | # Run the server 123 | # flask run 124 | ``` 125 | **Run the server** 126 | 127 | ```bash 128 | flask run 129 | ``` 130 | 131 | **Example Request** 132 | 133 | ```bash 134 | curl -X GET "http://localhost:5000/report/research_report?query=what team may win the nba finals?" 135 | ``` 136 | 137 | **Note**: The above code snippets are just examples. You can customize them as per your requirements. 138 | -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | We're constantly working on additional features and improvements to our products and services. We're also working on new products and services to help you build better AI applications using [GPT Researcher](https://gptr.dev). 4 | 5 | Our vision is to build the #1 autonomous research agent for AI developers and researchers, and we're excited to have you join us on this journey! 6 | 7 | The roadmap is prioritized based on the following goals: Performance, Quality, Modularity and Conversational flexibility. The roadmap is public and can be found [here](https://trello.com/b/3O7KBePw/gpt-researcher-roadmap). 8 | 9 | Interested in collaborating or contributing? Check out our [contributing page](/docs/contribute) for more information. 
-------------------------------------------------------------------------------- /docs/docs/gpt-researcher/tailored-research.md: -------------------------------------------------------------------------------- 1 | # Tailored Research 2 | The GPT Researcher package allows you to tailor the research to your needs such as researching on specific sources or local documents, and even specify the agent prompt instruction upon which the research is conducted. 3 | 4 | ### Research on Specific Sources 📚 5 | 6 | You can specify the sources you want the GPT Researcher to research on by providing a list of URLs. The GPT Researcher will then conduct research on the provided sources. 7 | 8 | ```python 9 | from gpt_researcher import GPTResearcher 10 | import asyncio 11 | 12 | async def get_report(query: str, report_type: str, sources: list) -> str: 13 | researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) 14 | await researcher.conduct_research() 15 | report = await researcher.write_report() 16 | return report 17 | 18 | if __name__ == "__main__": 19 | query = "What are the latest advancements in AI?" 20 | report_type = "research_report" 21 | sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] 22 | 23 | report = asyncio.run(get_report(query, report_type, sources)) 24 | print(report) 25 | ``` 26 | 27 | ### Specify Agent Prompt 📝 28 | 29 | You can specify the agent prompt instruction upon which the research is conducted. This allows you to guide the research in a specific direction and tailor the report layout. 30 | Simply pass the prompt as the `query` argument to the `GPTResearcher` class and the "custom_report" `report_type`. 31 | 32 | ```python 33 | from gpt_researcher import GPTResearcher 34 | import asyncio 35 | 36 | async def get_report(prompt: str, report_type: str) -> str: 37 | researcher = GPTResearcher(query=prompt, report_type=report_type) 38 | await researcher.conduct_research() 39 | report = await researcher.write_report() 40 | return report 41 | 42 | if __name__ == "__main__": 43 | report_type = "custom_report" 44 | prompt = "Research the latest advancements in AI and provide a detailed report in APA format including sources." 45 | 46 | report = asyncio.run(get_report(prompt=prompt, report_type=report_type)) 47 | print(report) 48 | ``` 49 | 50 | ### Research on Local Documents 📄 51 | You can instruct the GPT Researcher to research on local documents by providing the path to those documents. Currently supported file formats are: PDF, plain text, CSV, Excel, Markdown, PowerPoint, and Word documents. 52 | 53 | *Step 1*: Add the env variable `DOC_PATH` pointing to the folder where your documents are located. 54 | 55 | For example: 56 | 57 | ```bash 58 | export DOC_PATH="./my-docs" 59 | ``` 60 | 61 | *Step 2*: When you create an instance of the `GPTResearcher` class, pass the `report_source` argument as `"local"`. 62 | 63 | GPT Researcher will then conduct research on the provided documents. 64 | 65 | ```python 66 | from gpt_researcher import GPTResearcher 67 | import asyncio 68 | 69 | async def get_report(query: str, report_type: str, report_source: str) -> str: 70 | researcher = GPTResearcher(query=query, report_type=report_type, report_source=report_source) 71 | await researcher.conduct_research() 72 | report = await researcher.write_report() 73 | return report 74 | 75 | if __name__ == "__main__": 76 | query = "What can you tell me about myself based on my documents?" 
77 | report_type = "research_report" 78 | report_source = "local" # "local" or "web" 79 | 80 | report = asyncio.run(get_report(query=query, report_type=report_type, report_source=report_source)) 81 | print(report) 82 | ``` -------------------------------------------------------------------------------- /docs/docs/gpt-researcher/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | We're constantly working to provide a more stable version. If you're running into any issues, please first check out the resolved issues or ask us via our [Discord community](https://discord.gg/2pFkc83fRq). 3 | 4 | ### model: gpt-4 does not exist 5 | This relates to not having permission to use gpt-4 yet. Based on OpenAI, it will be [widely available for all by end of July](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4). 6 | 7 | ### cannot load library 'gobject-2.0-0' 8 | 9 | The issue relates to the library WeasyPrint (which is used to generate PDFs from the research report). Please follow this guide to resolve it: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html 10 | 11 | Or you can install this package manually 12 | 13 | In case of MacOS you can install this lib using 14 | `brew install glib gobject-introspection` 15 | 16 | In case of Linux you can install this lib using 17 | `sudo apt install libglib2.0-dev` 18 | 19 | **cannot load library 'pango'** 20 | 21 | In case of MacOS you can install this lib using 22 | `brew install pango` 23 | 24 | In case of Linux you can install this lib using 25 | `sudo apt install libpango-1.0-0` 26 | 27 | **Workaround for Mac M chip users** 28 | 29 | If the above solutions don't work, you can try the following: 30 | - Install a fresh version of Python 3.11 pointed to brew: 31 | `brew install python@3.11` 32 | - Install the required libraries: 33 | `brew install pango glib gobject-introspection` 34 | - Install the required GPT Researcher Python packages: 35 | `pip3.11 install -r requirements.txt` 36 | - Run the app with Python 3.11 (using brew): 37 | `python3.11 -m uvicorn main:app --reload` 38 | 39 | **Error processing the url** 40 | 41 | We're using [Selenium](https://www.selenium.dev) for site scraping. Some sites fail to be scraped. In these cases, restart and try running again. 42 | 43 | 44 | **Chrome version issues** 45 | 46 | Many users have an issue with their chromedriver because the latest chrome browser version doesn't have a compatible chrome driver yet. 47 | 48 | To downgrade your Chrome web browser using [slimjet](https://www.slimjet.com/chrome/google-chrome-old-version.php), follow these steps. First, visit the website and scroll down to find the list of available older Chrome versions. Choose the version you wish to install 49 | making sure it's compatible with your operating system. 50 | Once you've selected the desired version, click on the corresponding link to download the installer. Before proceeding with the installation, it's crucial to uninstall your current version of Chrome to avoid conflicts. 
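Once the older Chrome build is installed, a quick sanity check (assuming both binaries are on your `PATH`; on macOS the Chrome binary sits inside the app bundle) is to compare the reported versions:

```bash
# The major version numbers printed by these two commands should match
google-chrome --version
chromedriver --version
```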
51 | 52 | It's important to check if the version you downgrade to, has a chromedriver available in the official [chrome driver website](https://chromedriver.chromium.org/downloads) 53 | 54 | **If none of the above work, you can [try out our hosted beta](https://app.tavily.com)** -------------------------------------------------------------------------------- /docs/docs/reference/config/config.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_label: config 3 | title: config.config 4 | --- 5 | 6 | Configuration class to store the state of bools for different scripts access. 7 | 8 | ## Config Objects 9 | 10 | ```python 11 | class Config(metaclass=Singleton) 12 | ``` 13 | 14 | Configuration class to store the state of bools for different scripts access. 15 | 16 | #### \_\_init\_\_ 17 | 18 | ```python 19 | def __init__() -> None 20 | ``` 21 | 22 | Initialize the Config class 23 | 24 | #### set\_fast\_llm\_model 25 | 26 | ```python 27 | def set_fast_llm_model(value: str) -> None 28 | ``` 29 | 30 | Set the fast LLM model value. 31 | 32 | #### set\_smart\_llm\_model 33 | 34 | ```python 35 | def set_smart_llm_model(value: str) -> None 36 | ``` 37 | 38 | Set the smart LLM model value. 39 | 40 | #### set\_fast\_token\_limit 41 | 42 | ```python 43 | def set_fast_token_limit(value: int) -> None 44 | ``` 45 | 46 | Set the fast token limit value. 47 | 48 | #### set\_smart\_token\_limit 49 | 50 | ```python 51 | def set_smart_token_limit(value: int) -> None 52 | ``` 53 | 54 | Set the smart token limit value. 55 | 56 | #### set\_browse\_chunk\_max\_length 57 | 58 | ```python 59 | def set_browse_chunk_max_length(value: int) -> None 60 | ``` 61 | 62 | Set the browse_website command chunk max length value. 63 | 64 | #### set\_openai\_api\_key 65 | 66 | ```python 67 | def set_openai_api_key(value: str) -> None 68 | ``` 69 | 70 | Set the OpenAI API key value. 71 | 72 | #### set\_debug\_mode 73 | 74 | ```python 75 | def set_debug_mode(value: bool) -> None 76 | ``` 77 | 78 | Set the debug mode value. 79 | 80 | ## APIKeyError Objects 81 | 82 | ```python 83 | class APIKeyError(Exception) 84 | ``` 85 | 86 | Exception raised when an API key is not set in config.py or as an environment variable. 87 | 88 | #### check\_openai\_api\_key 89 | 90 | ```python 91 | def check_openai_api_key(cfg) -> None 92 | ``` 93 | 94 | Check if the OpenAI API key is set in config.py or as an environment variable. 95 | 96 | #### check\_tavily\_api\_key 97 | 98 | ```python 99 | def check_tavily_api_key(cfg) -> None 100 | ``` 101 | 102 | Check if the Tavily Search API key is set in config.py or as an environment variable. 103 | 104 | #### check\_google\_api\_key 105 | 106 | ```python 107 | def check_google_api_key(cfg) -> None 108 | ``` 109 | 110 | Check if the Google API key is set in config.py or as an environment variable. 111 | 112 | #### check\_serp\_api\_key 113 | 114 | ```python 115 | def check_serp_api_key(cfg) -> None 116 | ``` 117 | 118 | Check if the SERP API key is set in config.py or as an environment variable. 119 | 120 | #### check\_searx\_url 121 | 122 | ```python 123 | def check_searx_url(cfg) -> None 124 | ``` 125 | 126 | Check if the Searx URL is set in config.py or as an environment variable. 
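As a quick orientation, the snippet below sketches how these objects fit together. The import path follows this page's title (`config.config`), and the exact failure behaviour of the `check_*` helpers is not shown here, so treat both as assumptions and consult the source for details.

```python
# Hypothetical usage of the Config singleton documented above.
from config.config import Config, check_openai_api_key

cfg = Config()                     # Singleton metaclass: every call returns the same instance
cfg.set_smart_llm_model("gpt-4")   # override defaults before other scripts read them
cfg.set_fast_token_limit(3000)
cfg.set_debug_mode(True)

check_openai_api_key(cfg)          # complains if no OpenAI key is configured (see APIKeyError above)
```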
127 | 128 | -------------------------------------------------------------------------------- /docs/docs/reference/config/singleton.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_label: singleton 3 | title: config.singleton 4 | --- 5 | 6 | The singleton metaclass for ensuring only one instance of a class. 7 | 8 | ## Singleton Objects 9 | 10 | ```python 11 | class Singleton(abc.ABCMeta, type) 12 | ``` 13 | 14 | Singleton metaclass for ensuring only one instance of a class. 15 | 16 | #### \_\_call\_\_ 17 | 18 | ```python 19 | def __call__(cls, *args, **kwargs) 20 | ``` 21 | 22 | Call method for the singleton metaclass. 23 | 24 | ## AbstractSingleton Objects 25 | 26 | ```python 27 | class AbstractSingleton(abc.ABC, metaclass=Singleton) 28 | ``` 29 | 30 | Abstract singleton class for ensuring only one instance of a class. 31 | 32 | -------------------------------------------------------------------------------- /docs/docs/reference/processing/html.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_label: html 3 | title: processing.html 4 | --- 5 | 6 | HTML processing functions 7 | 8 | #### extract\_hyperlinks 9 | 10 | ```python 11 | def extract_hyperlinks(soup: BeautifulSoup, 12 | base_url: str) -> list[tuple[str, str]] 13 | ``` 14 | 15 | Extract hyperlinks from a BeautifulSoup object 16 | 17 | **Arguments**: 18 | 19 | - `soup` _BeautifulSoup_ - The BeautifulSoup object 20 | - `base_url` _str_ - The base URL 21 | 22 | 23 | **Returns**: 24 | 25 | List[Tuple[str, str]]: The extracted hyperlinks 26 | 27 | #### format\_hyperlinks 28 | 29 | ```python 30 | def format_hyperlinks(hyperlinks: list[tuple[str, str]]) -> list[str] 31 | ``` 32 | 33 | Format hyperlinks to be displayed to the user 34 | 35 | **Arguments**: 36 | 37 | - `hyperlinks` _List[Tuple[str, str]]_ - The hyperlinks to format 38 | 39 | 40 | **Returns**: 41 | 42 | - `List[str]` - The formatted hyperlinks 43 | 44 | -------------------------------------------------------------------------------- /docs/docs/reference/processing/text.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_label: text 3 | title: processing.text 4 | --- 5 | 6 | Text processing functions 7 | 8 | #### split\_text 9 | 10 | ```python 11 | def split_text(text: str, 12 | max_length: int = 8192) -> Generator[str, None, None] 13 | ``` 14 | 15 | Split text into chunks of a maximum length 16 | 17 | **Arguments**: 18 | 19 | - `text` _str_ - The text to split 20 | - `max_length` _int, optional_ - The maximum length of each chunk. Defaults to 8192. 
21 | 22 | 23 | **Yields**: 24 | 25 | - `str` - The next chunk of text 26 | 27 | 28 | **Raises**: 29 | 30 | - `ValueError` - If the text is longer than the maximum length 31 | 32 | #### summarize\_text 33 | 34 | ```python 35 | def summarize_text(url: str, 36 | text: str, 37 | question: str, 38 | driver: Optional[WebDriver] = None) -> str 39 | ``` 40 | 41 | Summarize text using the OpenAI API 42 | 43 | **Arguments**: 44 | 45 | - `url` _str_ - The url of the text 46 | - `text` _str_ - The text to summarize 47 | - `question` _str_ - The question to ask the model 48 | - `driver` _WebDriver_ - The webdriver to use to scroll the page 49 | 50 | 51 | **Returns**: 52 | 53 | - `str` - The summary of the text 54 | 55 | #### scroll\_to\_percentage 56 | 57 | ```python 58 | def scroll_to_percentage(driver: WebDriver, ratio: float) -> None 59 | ``` 60 | 61 | Scroll to a percentage of the page 62 | 63 | **Arguments**: 64 | 65 | - `driver` _WebDriver_ - The webdriver to use 66 | - `ratio` _float_ - The percentage to scroll to 67 | 68 | 69 | **Raises**: 70 | 71 | - `ValueError` - If the ratio is not between 0 and 1 72 | 73 | #### create\_message 74 | 75 | ```python 76 | def create_message(chunk: str, question: str) -> Dict[str, str] 77 | ``` 78 | 79 | Create a message for the chat completion 80 | 81 | **Arguments**: 82 | 83 | - `chunk` _str_ - The chunk of text to summarize 84 | - `question` _str_ - The question to answer 85 | 86 | 87 | **Returns**: 88 | 89 | Dict[str, str]: The message to send to the chat completion 90 | 91 | #### write\_to\_file 92 | 93 | ```python 94 | def write_to_file(filename: str, text: str) -> None 95 | ``` 96 | 97 | Write text to a file 98 | 99 | **Arguments**: 100 | 101 | - `text` _str_ - The text to write 102 | - `filename` _str_ - The filename to write to 103 | 104 | -------------------------------------------------------------------------------- /docs/docs/reference/sidebar.json: -------------------------------------------------------------------------------- 1 | { 2 | "items": [ 3 | { 4 | "items": [ 5 | "reference/config/config", 6 | "reference/config/singleton" 7 | ], 8 | "label": "config", 9 | "type": "category" 10 | }, 11 | { 12 | "items": [ 13 | "reference/processing/html", 14 | "reference/processing/text" 15 | ], 16 | "label": "processing", 17 | "type": "category" 18 | } 19 | ], 20 | "label": "Reference", 21 | "type": "category" 22 | } -------------------------------------------------------------------------------- /docs/docs/welcome.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Hey there! 👋 4 | 5 | We're a team of AI researchers and developers who are passionate about building the next generation of AI assistants. 6 | Our mission is to empower individuals and organizations with accurate, unbiased, and factual information. 7 | 8 | ### GPT Researcher 9 | In this digital age, quickly accessing relevant and trustworthy information is more crucial than ever. However, we've learned that none of today's search engines provide a suitable tool that provides factual, explicit and objective answers without the need to continuously click and explore multiple sites for a given research task. 10 | 11 | This is why we've built the trending open source **[GPT Researcher](https://github.com/assafelovic/gpt-researcher)**. GPT Researcher is an autonomous agent that takes care of the tedious task of research for you, by scraping, filtering and aggregating over 20+ web sources per a single research task. 
12 | 13 | To learn more about GPT Researcher, check out the [documentation page](/docs/gpt-researcher/introduction). 14 | 15 | ### Tavily Search API 16 | Tavily Search API is a search engine optimized for LLMs, aimed at efficient, quick and persistent search results. 17 | 18 | To learn how to build your AI application with Tavily Search API, check out the [documentation page](https://docs.tavily.com/docs/tavily-api/introduction). 19 | 20 | To try our API in action, you can now use GPT Researcher on our hosted version [here](https://app.tavily.com/chat) or on our [API Playground](https://app.tavily.com/playground). 21 | 22 | If you're an AI developer looking to integrate your application with our API or seek increased API limits, **[please reach out!](mailto:support@tavily.com)** 23 | -------------------------------------------------------------------------------- /docs/docusaurus.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('@docusaurus/types').DocusaurusConfig} */ 2 | const math = require('remark-math'); 3 | const katex = require('rehype-katex'); 4 | 5 | module.exports = { 6 | title: 'GPT Researcher', 7 | tagline: 'The leading autonomous AI research agent', 8 | url: 'https://docs.gptr.dev', 9 | baseUrl: '/', 10 | onBrokenLinks: 'ignore', 11 | //deploymentBranch: 'master', 12 | onBrokenMarkdownLinks: 'warn', 13 | favicon: 'img/favicon.ico', 14 | organizationName: 'assafelovic', 15 | trailingSlash: false, 16 | projectName: 'gpt-researcher', 17 | themeConfig: { 18 | navbar: { 19 | title: 'GPT Researcher', 20 | logo: { 21 | alt: 'GPT Researcher', 22 | src: 'img/gptr-logo.png', 23 | }, 24 | items: [ 25 | { 26 | type: 'doc', 27 | docId: 'welcome', 28 | position: 'left', 29 | label: 'Docs', 30 | }, 31 | 32 | {to: 'blog', label: 'Blog', position: 'left'}, 33 | { 34 | type: 'doc', 35 | docId: 'faq', 36 | position: 'left', 37 | label: 'FAQ', 38 | }, 39 | { 40 | href: 'mailto:assaf.elovic@gmail.com', 41 | position: 'left', 42 | label: 'Contact', 43 | }, 44 | { 45 | href: 'https://github.com/assafelovic/gpt-researcher', 46 | label: 'GitHub', 47 | position: 'right', 48 | }, 49 | ], 50 | }, 51 | footer: { 52 | style: 'dark', 53 | links: [ 54 | { 55 | title: 'Community', 56 | items: [ 57 | { 58 | label: 'Discord', 59 | href: 'https://discord.gg/8YkBcCED5y', 60 | }, 61 | { 62 | label: 'Twitter', 63 | href: 'https://twitter.com/assaf_elovic', 64 | }, 65 | { 66 | label: 'LinkedIn', 67 | href: 'https://www.linkedin.com/in/assafe/', 68 | }, 69 | ], 70 | }, 71 | { 72 | title: 'Company', 73 | items: [ 74 | { 75 | label: 'Homepage', 76 | href: 'https://gptr.dev', 77 | }, 78 | { 79 | label: 'Contact', 80 | href: 'mailto:assafelovic@gmail.com', 81 | }, 82 | ], 83 | }, 84 | ], 85 | copyright: `Copyright © ${new Date().getFullYear()} GPT Researcher.`, 86 | }, 87 | }, 88 | presets: [ 89 | [ 90 | '@docusaurus/preset-classic', 91 | { 92 | docs: { 93 | sidebarPath: require.resolve('./sidebars.js'), 94 | // Please change this to your repo. 
95 | editUrl: 96 | 'https://github.com/assafelovic/gpt-researcher/tree/master/docs', 97 | remarkPlugins: [math], 98 | rehypePlugins: [katex], 99 | }, 100 | theme: { 101 | customCss: require.resolve('./src/css/custom.css'), 102 | }, 103 | }, 104 | ], 105 | ], 106 | stylesheets: [ 107 | { 108 | href: "https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css", 109 | integrity: "sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc", 110 | crossorigin: "anonymous", 111 | }, 112 | ], 113 | 114 | plugins: [ 115 | // ... Your other plugins. 116 | [ 117 | require.resolve("@easyops-cn/docusaurus-search-local"), 118 | { 119 | // ... Your options. 120 | // `hashed` is recommended as long-term-cache of index file is possible. 121 | hashed: true, 122 | blogDir:"./blog/" 123 | // For Docs using Chinese, The `language` is recommended to set to: 124 | // ``` 125 | // language: ["en", "zh"], 126 | // ``` 127 | // When applying `zh` in language, please install `nodejieba` in your project. 128 | }, 129 | ], 130 | ], 131 | }; 132 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "website", 3 | "version": "0.0.0", 4 | "private": true, 5 | "resolutions" :{ 6 | "nth-check":"2.0.1", 7 | "trim":"0.0.3", 8 | "got": "11.8.5", 9 | "node-forge": "1.3.0", 10 | "minimatch": "3.0.5", 11 | "loader-utils": "2.0.4", 12 | "eta": "2.0.0", 13 | "@sideway/formula": "3.0.1", 14 | "http-cache-semantics": "4.1.1" 15 | }, 16 | "scripts": { 17 | "docusaurus": "docusaurus", 18 | "start": "docusaurus start", 19 | "build": "docusaurus build", 20 | "swizzle": "docusaurus swizzle", 21 | "deploy": "docusaurus deploy", 22 | "clear": "docusaurus clear", 23 | "serve": "docusaurus serve", 24 | "write-translations": "docusaurus write-translations", 25 | "write-heading-ids": "docusaurus write-heading-ids" 26 | }, 27 | "dependencies": { 28 | "@docusaurus/core": "0.0.0-4193", 29 | "@docusaurus/preset-classic": "0.0.0-4193", 30 | "@easyops-cn/docusaurus-search-local": "^0.21.1", 31 | "@mdx-js/react": "^1.6.21", 32 | "@svgr/webpack": "^5.5.0", 33 | "clsx": "^1.1.1", 34 | "file-loader": "^6.2.0", 35 | "hast-util-is-element": "1.1.0", 36 | "react": "^17.0.1", 37 | "react-dom": "^17.0.1", 38 | "rehype-katex": "4", 39 | "remark-math": "3", 40 | "trim": "^0.0.3", 41 | "url-loader": "^4.1.1", 42 | "minimatch": "3.0.5" 43 | }, 44 | "browserslist": { 45 | "production": [ 46 | ">0.5%", 47 | "not dead", 48 | "not op_mini all" 49 | ], 50 | "development": [ 51 | "last 1 chrome version", 52 | "last 1 firefox version", 53 | "last 1 safari version" 54 | ] 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /docs/pydoc-markdown.yml: -------------------------------------------------------------------------------- 1 | loaders: 2 | - type: python 3 | search_path: [../docs] 4 | processors: 5 | - type: filter 6 | skip_empty_modules: true 7 | - type: smart 8 | - type: crossref 9 | renderer: 10 | type: docusaurus 11 | docs_base_path: docs 12 | relative_output_path: reference 13 | relative_sidebar_path: sidebar.json 14 | sidebar_top_level_label: Reference 15 | markdown: 16 | escape_html_in_docstring: false 17 | -------------------------------------------------------------------------------- /docs/sidebars.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Creating a sidebar enables you to: 3 | - create an 
ordered group of docs 4 | - render a sidebar for each doc of that group 5 | - provide next/previous navigation 6 | 7 | The sidebars can be generated from the filesystem, or explicitly defined here. 8 | 9 | Create as many sidebars as you want. 10 | */ 11 | 12 | module.exports = { 13 | docsSidebar: [ 14 | 'welcome', 15 | { 16 | type: 'category', 17 | label: 'GPT Researcher', 18 | collapsible: true, 19 | collapsed: false, 20 | items: [ 21 | 'gpt-researcher/introduction', 22 | 'gpt-researcher/getting-started', 23 | 'gpt-researcher/pip-package', 24 | 'gpt-researcher/example', 25 | 'gpt-researcher/troubleshooting', 26 | ], 27 | }, 28 | { 29 | type: 'category', 30 | label: 'Customization', 31 | collapsible: true, 32 | collapsed: false, 33 | items: [ 34 | 'gpt-researcher/config', 35 | 'gpt-researcher/tailored-research', 36 | 'gpt-researcher/llms', 37 | ] 38 | }, 39 | { 40 | type: 'category', 41 | label: 'Multi-Agent Frameworks', 42 | collapsible: true, 43 | collapsed: false, 44 | items: [ 45 | 'gpt-researcher/langgraph', 46 | ] 47 | }, 48 | {'Examples': [{type: 'autogenerated', dirName: 'examples'}]}, 49 | 'contribute', 50 | ], 51 | // pydoc-markdown auto-generated markdowns from docstrings 52 | referenceSideBar: [require("./docs/reference/sidebar.json")] 53 | }; 54 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import clsx from 'clsx'; 3 | import { Link } from 'react-router-dom'; 4 | import styles from './HomepageFeatures.module.css'; 5 | 6 | const FeatureList = [ 7 | { 8 | title: 'GPT Researcher', 9 | Svg: require('../../static/img/gptr-logo.png').default, 10 | docLink: './docs/gpt-researcher/getting-started', 11 | description: ( 12 | <> 13 | GPT Researcher is an open source autonomous agent designed for comprehensive online research on a variety of tasks. 14 | 15 | ), 16 | }, 17 | /*{ 18 | title: 'Tavily Search API', 19 | Svg: require('../../static/img/tavily.png').default, 20 | docLink: './docs/tavily-api/introduction', 21 | description: ( 22 | <> 23 | Tavily Search API is a search engine optimized for LLMs, optimized for a factual, efficient, and persistent search experience 24 | 25 | ), 26 | },*/ 27 | { 28 | title: 'Multi-Agent Assistant', 29 | Svg: require('../../static/img/multi-agent.png').default, 30 | docLink: './docs/gpt-researcher/langgraph', 31 | description: ( 32 | <> 33 | Learn how a team of AI agents can work together to conduct research on a given topic, from planning to publication. 34 | 35 | ), 36 | }, 37 | { 38 | title: 'Examples and Demos', 39 | Svg: require('../../static/img/examples.png').default, 40 | docLink: './docs/examples/examples', 41 | description: ( 42 | <> 43 | Check out Tavily API in action across multiple frameworks and use cases 44 | 45 | ), 46 | }, 47 | ]; 48 | 49 | function Feature({Svg, title, description, docLink}) { 50 | return ( 51 |
52 |
53 | {/**/} 54 | {title} 55 |
56 |
57 | 58 |

{title}

59 | 60 |

{description}

61 |
62 |
63 | ); 64 | } 65 | 66 | export default function HomepageFeatures() { 67 | return ( 68 |
69 |
70 |
71 | {FeatureList.map((props, idx) => ( 72 | 73 | ))} 74 |
75 |
76 |
77 | ); 78 | } 79 | -------------------------------------------------------------------------------- /docs/src/components/HomepageFeatures.module.css: -------------------------------------------------------------------------------- 1 | /* stylelint-disable docusaurus/copyright-header */ 2 | 3 | .features { 4 | display: flex; 5 | align-items: center; 6 | padding: 2rem 0; 7 | width: 100%; 8 | } 9 | 10 | .featureSvg { 11 | height: 120px; 12 | width: 200px; 13 | } 14 | -------------------------------------------------------------------------------- /docs/src/css/custom.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --ifm-font-size-base: 17px; 3 | --ifm-code-font-size: 90%; 4 | 5 | --ifm-color-primary: #0c4da2; 6 | --ifm-color-primary-dark: rgb(11, 69, 146); 7 | --ifm-color-primary-darker: #0a418a; 8 | --ifm-color-primary-darkest: #083671; 9 | --ifm-color-primary-light: #0d55b2; 10 | --ifm-color-primary-lighter: #0e59ba; 11 | --ifm-color-primary-lightest: #1064d3; 12 | 13 | --ifm-color-emphasis-300: #1064d3; 14 | --ifm-link-color: #1064d3; 15 | --ifm-menu-color-active: #1064d3; 16 | } 17 | 18 | .docusaurus-highlight-code-line { 19 | background-color: rgba(0, 0, 0, 0.1); 20 | display: block; 21 | margin: 0 calc(-1 * var(--ifm-pre-padding)); 22 | padding: 0 var(--ifm-pre-padding); 23 | } 24 | html[data-theme='dark'] .docusaurus-highlight-code-line { 25 | background-color: rgb(0, 0, 0, 0.3); 26 | } 27 | 28 | .admonition-content a { 29 | text-decoration: underline; 30 | font-weight: 600; 31 | color: inherit; 32 | } 33 | 34 | a { 35 | font-weight: 600; 36 | } 37 | 38 | blockquote { 39 | /* samsung blue with lots of transparency */ 40 | background-color: #0c4da224; 41 | } 42 | @media (prefers-color-scheme: dark) { 43 | :root { 44 | --ifm-hero-text-color: white; 45 | } 46 | } 47 | @media (prefers-color-scheme: dark) { 48 | .hero.hero--primary { --ifm-hero-text-color: white;} 49 | } 50 | 51 | @media (prefers-color-scheme: dark) { 52 | blockquote { 53 | --ifm-color-emphasis-300: var(--ifm-color-primary); 54 | /* border-left: 6px solid var(--ifm-color-emphasis-300); */ 55 | } 56 | } 57 | @media (prefers-color-scheme: dark) { 58 | code { 59 | /* background-color: rgb(41, 45, 62); */ 60 | } 61 | } 62 | 63 | 64 | /* Docusaurus still defaults to their green! 
*/ 65 | @media (prefers-color-scheme: dark) { 66 | .react-toggle-thumb { 67 | border-color: var(--ifm-color-primary) !important; 68 | } 69 | } 70 | 71 | 72 | .header-github-link:hover { 73 | opacity: 0.6; 74 | } 75 | 76 | .header-github-link:before { 77 | content: ''; 78 | width: 24px; 79 | height: 24px; 80 | display: flex; 81 | background: url("data:image/svg+xml,%3Csvg viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12'/%3E%3C/svg%3E") 82 | no-repeat; 83 | } 84 | 85 | html[data-theme='dark'] .header-github-link:before { 86 | background: url("data:image/svg+xml,%3Csvg viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath fill='white' d='M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12'/%3E%3C/svg%3E") 87 | no-repeat; 88 | } 89 | -------------------------------------------------------------------------------- /docs/src/pages/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import clsx from 'clsx'; 3 | import Layout from '@theme/Layout'; 4 | import Link from '@docusaurus/Link'; 5 | import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; 6 | import styles from './index.module.css'; 7 | import HomepageFeatures from '../components/HomepageFeatures'; 8 | 9 | function HomepageHeader() { 10 | const {siteConfig} = useDocusaurusContext(); 11 | return ( 12 |
13 |
14 |

{siteConfig.title}

15 |

{siteConfig.tagline}

16 |
17 | 20 | Getting Started - 5 min ⏱️ 21 | 22 |
23 |
24 |
25 | ); 26 | } 27 | 28 | export default function Home() { 29 | const {siteConfig} = useDocusaurusContext(); 30 | return ( 31 | 34 | 35 |
36 | 37 |
38 |
39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /docs/src/pages/index.module.css: -------------------------------------------------------------------------------- 1 | /* stylelint-disable docusaurus/copyright-header */ 2 | 3 | /** 4 | * CSS files with the .module.css suffix will be treated as CSS modules 5 | * and scoped locally. 6 | */ 7 | 8 | .heroBanner { 9 | padding: 5rem 0; 10 | text-align: center; 11 | position: relative; 12 | overflow: hidden; 13 | } 14 | 15 | @media screen and (max-width: 966px) { 16 | .heroBanner { 17 | padding: 2rem; 18 | } 19 | } 20 | 21 | .buttons { 22 | display: flex; 23 | align-items: center; 24 | justify-content: center; 25 | } 26 | -------------------------------------------------------------------------------- /docs/static/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/.nojekyll -------------------------------------------------------------------------------- /docs/static/img/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/architecture.png -------------------------------------------------------------------------------- /docs/static/img/banner1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/banner1.jpg -------------------------------------------------------------------------------- /docs/static/img/examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/examples.png -------------------------------------------------------------------------------- /docs/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/favicon.ico -------------------------------------------------------------------------------- /docs/static/img/gptr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/gptr-logo.png -------------------------------------------------------------------------------- /docs/static/img/gptresearcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/gptresearcher.png -------------------------------------------------------------------------------- /docs/static/img/leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/leaderboard.png -------------------------------------------------------------------------------- /docs/static/img/multi-agent.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/multi-agent.png -------------------------------------------------------------------------------- /docs/static/img/tavily.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/docs/static/img/tavily.png -------------------------------------------------------------------------------- /examples/pip-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "id": "byPgKYhAE6gn" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "os.environ['OPENAI_API_KEY'] = 'your_openai_api_key'\n", 27 | "os.environ['TAVILY_API_KEY'] = 'your_tavily_api_key' # Get a free key here: https://app.tavily.com" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "source": [ 33 | "!pip install -U gpt-researcher nest_asyncio" 34 | ], 35 | "metadata": { 36 | "id": "-rXET3OZLxwH" 37 | }, 38 | "execution_count": null, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "source": [ 44 | "import nest_asyncio # required for notebooks\n", 45 | "nest_asyncio.apply()\n", 46 | "\n", 47 | "from gpt_researcher import GPTResearcher\n", 48 | "import asyncio\n", 49 | "\n", 50 | "async def get_report(query: str, report_type: str) -> str:\n", 51 | " researcher = GPTResearcher(query, report_type)\n", 52 | " research_result = await researcher.conduct_research()\n", 53 | " report = await researcher.write_report()\n", 54 | " return report\n", 55 | "\n", 56 | "if __name__ == \"__main__\":\n", 57 | " query = \"which team may win the NBA finals?\"\n", 58 | " report_type = \"research_report\"\n", 59 | "\n", 60 | " report = asyncio.run(get_report(query, report_type))\n", 61 | " print(report)" 62 | ], 63 | "metadata": { 64 | "id": "KWZe2InrL0ji" 65 | }, 66 | "execution_count": null, 67 | "outputs": [] 68 | } 69 | ] 70 | } 71 | -------------------------------------------------------------------------------- /examples/sample_report.py: -------------------------------------------------------------------------------- 1 | from gpt_researcher import GPTResearcher 2 | import asyncio 3 | 4 | 5 | async def main(): 6 | """ 7 | This is a sample script that shows how to run a research report. 8 | """ 9 | # Query 10 | query = "What happened in the latest burning man floods?" 
11 | 12 | # Report Type 13 | report_type = "research_report" 14 | 15 | # Initialize the researcher 16 | researcher = GPTResearcher(query=query, report_type=report_type, config_path=None) 17 | # Conduct research on the given query 18 | await researcher.conduct_research() 19 | # Write the report 20 | report = await researcher.write_report() 21 | 22 | return report 23 | 24 | 25 | if __name__ == "__main__": 26 | asyncio.run(main()) 27 | -------------------------------------------------------------------------------- /frontend/pdf_styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Libre Baskerville', serif; 3 | font-size: 12pt; /* standard size for academic papers */ 4 | line-height: 1.6; /* for readability */ 5 | color: #333; /* softer on the eyes than black */ 6 | background-color: #fff; /* white background */ 7 | margin: 0; 8 | padding: 0; 9 | } 10 | 11 | h1, h2, h3, h4, h5, h6 { 12 | font-family: 'Libre Baskerville', serif; 13 | color: #000; /* darker than the body text */ 14 | margin-top: 1em; /* space above headers */ 15 | } 16 | 17 | h1 { 18 | font-size: 2em; /* make h1 twice the size of the body text */ 19 | } 20 | 21 | h2 { 22 | font-size: 1.5em; 23 | } 24 | 25 | /* Add some space between paragraphs */ 26 | p { 27 | margin-bottom: 1em; 28 | } 29 | 30 | /* Style for blockquotes, often used in academic papers */ 31 | blockquote { 32 | font-style: italic; 33 | margin: 1em 0; 34 | padding: 1em; 35 | background-color: #f9f9f9; /* a light grey background */ 36 | } 37 | 38 | /* You might want to style tables, figures, etc. too */ 39 | table { 40 | border-collapse: collapse; 41 | width: 100%; 42 | } 43 | 44 | table, th, td { 45 | border: 1px solid #ddd; 46 | text-align: left; 47 | padding: 8px; 48 | } 49 | 50 | th { 51 | background-color: #f2f2f2; 52 | color: black; 53 | } -------------------------------------------------------------------------------- /frontend/static/academicResearchAgentAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/academicResearchAgentAvatar.png -------------------------------------------------------------------------------- /frontend/static/businessAnalystAgentAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/businessAnalystAgentAvatar.png -------------------------------------------------------------------------------- /frontend/static/computerSecurityanalystAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/computerSecurityanalystAvatar.png -------------------------------------------------------------------------------- /frontend/static/defaultAgentAvatar.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/defaultAgentAvatar.JPG -------------------------------------------------------------------------------- /frontend/static/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/favicon.ico -------------------------------------------------------------------------------- /frontend/static/financeAgentAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/financeAgentAvatar.png -------------------------------------------------------------------------------- /frontend/static/gptr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/gptr-logo.png -------------------------------------------------------------------------------- /frontend/static/mathAgentAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/mathAgentAvatar.png -------------------------------------------------------------------------------- /frontend/static/travelAgentAvatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/frontend/static/travelAgentAvatar.png -------------------------------------------------------------------------------- /frontend/styles.css: -------------------------------------------------------------------------------- 1 | @keyframes gradientBG { 2 | 0% {background-position: 0% 50%;} 3 | 50% {background-position: 100% 50%;} 4 | 100% {background-position: 0% 50%;} 5 | } 6 | 7 | html { 8 | scroll-behavior: smooth; 9 | } 10 | 11 | body { 12 | font-family: 'Montserrat', sans-serif; 13 | color: #fff; 14 | line-height: 1.6; 15 | background-size: 200% 200%; 16 | background-image: linear-gradient(45deg, #151A2D, #2D284D, #151A2D); 17 | animation: gradientBG 10s ease infinite; 18 | } 19 | 20 | .landing { 21 | display: flex; 22 | justify-content: center; 23 | align-items: center; 24 | height: 100vh; 25 | text-align: center; 26 | } 27 | 28 | .landing h1 { 29 | font-size: 3.5rem; 30 | font-weight: 700; 31 | margin-bottom: 2rem; 32 | } 33 | 34 | .landing p { 35 | font-size: 1.5rem; 36 | font-weight: 400; 37 | max-width: 500px; 38 | margin: auto; 39 | margin-bottom: 2rem; 40 | } 41 | 42 | .container { 43 | max-width: 900px; 44 | margin: auto; 45 | padding: 20px; 46 | background-color: rgba(255, 255, 255, 0.1); 47 | border-radius: 12px; 48 | box-shadow: 0px 10px 25px rgba(0, 0, 0, 0.1); 49 | transition: all .3s ease-in-out; 50 | margin-bottom: 180px; 51 | } 52 | 53 | .container:hover { 54 | transform: scale(1.01); 55 | box-shadow: 0px 15px 30px rgba(0, 0, 0, 0.2); 56 | } 57 | 58 | input, select, #output, #reportContainer { 59 | background-color: rgba(255,255,255,0.1); 60 | border: none; 61 | color: #fff; 62 | transition: all .3s ease-in-out; 63 | } 64 | 65 | input:hover, input:focus, select:hover, select:focus { 66 | background-color: #dfe4ea; 67 | border: 1px solid rgba(255, 255, 255, 0.5); 68 | box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.1); 69 | transition: all 0.3s ease-in-out; 70 | } 71 | 72 | .btn-primary { 73 | background: linear-gradient(to right, #0062cc, #007bff); 74 | border: none; 75 | transition: all .3s ease-in-out; 76 | } 77 | 78 | .btn-secondary { 79 | background: linear-gradient(to 
right, #6c757d, #6c757d); 80 | border: none; 81 | transition: all .3s ease-in-out; 82 | } 83 | 84 | .btn:hover { 85 | opacity: 0.8; 86 | transform: scale(1.1); 87 | box-shadow: 0px 10px 20px rgba(0, 0, 0, 0.3); 88 | } 89 | 90 | .agent_question { 91 | font-size: 1.4rem; 92 | font-weight: 500; 93 | margin-bottom: 0.2rem; 94 | } 95 | 96 | footer { 97 | position: fixed; 98 | left: 0; 99 | bottom: 0; 100 | width: 100%; 101 | background: linear-gradient(to right, #151A2D, #111827); 102 | color: white; 103 | text-align: center; 104 | padding: 10px 0; 105 | } 106 | 107 | .margin-div { 108 | margin-top: 20px; 109 | margin-bottom: 20px; 110 | padding: 10px; 111 | } 112 | 113 | .agent_response { 114 | background-color: #747d8c; 115 | margin: 10px; 116 | padding: 10px; 117 | border-radius: 12px; 118 | } 119 | 120 | #output { 121 | height: 300px; 122 | font-family: 'Times New Roman', Times, , "Courier New", serif; 123 | overflow: auto; 124 | padding: 10px; 125 | margin-bottom: 10px; 126 | margin-top: 10px; 127 | } 128 | 129 | #reportContainer { 130 | background-color: rgba(255,255,255,0.1); 131 | border: none; 132 | color: #fff; 133 | transition: all .3s ease-in-out; 134 | padding: 10px; 135 | border-radius: 12px; 136 | } 137 | -------------------------------------------------------------------------------- /gpt_researcher/README.md: -------------------------------------------------------------------------------- 1 | # 🔎 GPT Researcher 2 | [![Official Website](https://img.shields.io/badge/Official%20Website-tavily.com-blue?style=for-the-badge&logo=world&logoColor=white)](https://tavily.com) 3 | [![Discord Follow](https://dcbadge.vercel.app/api/server/2pFkc83fRq?style=for-the-badge)](https://discord.com/invite/2pFkc83fRq) 4 | 5 | [![GitHub Repo stars](https://img.shields.io/github/stars/assafelovic/gpt-researcher?style=social)](https://github.com/assafelovic/gpt-researcher) 6 | [![Twitter Follow](https://img.shields.io/twitter/follow/tavilyai?style=social)](https://twitter.com/tavilyai) 7 | [![PyPI version](https://badge.fury.io/py/gpt-researcher.svg)](https://badge.fury.io/py/gpt-researcher) 8 | 9 | **GPT Researcher is an autonomous agent designed for comprehensive online research on a variety of tasks.** 10 | 11 | The agent can produce detailed, factual and unbiased research reports, with customization options for focusing on relevant resources, outlines, and lessons. Inspired by the recent [Plan-and-Solve](https://arxiv.org/abs/2305.04091) and [RAG](https://arxiv.org/abs/2005.11401) papers, GPT Researcher addresses issues of speed, determinism and reliability, offering a more stable performance and increased speed through parallelized agent work, as opposed to synchronous operations. 12 | 13 | **Our mission is to empower individuals and organizations with accurate, unbiased, and factual information by leveraging the power of AI.** 14 | 15 | #### PIP Package 16 | > **Step 0** - Install Python 3.11 or later. [See here](https://www.tutorialsteacher.com/python/install-python) for a step-by-step guide. 
17 | > **Step 1** - Install the GPT Researcher package from [PyPI](https://pypi.org/project/gpt-researcher/) 18 | ```bash 19 | $ pip install gpt-researcher 20 | ``` 21 | > **Step 2** - Create a .env file with your OpenAI and Tavily API keys, or simply export them 22 | ```bash 23 | $ export OPENAI_API_KEY={Your OpenAI API Key here} 24 | ``` 25 | ```bash 26 | $ export TAVILY_API_KEY={Your Tavily API Key here} 27 | ``` 28 | > **Step 3** - Start using GPT Researcher in your own code, for example: 29 | ```python 30 | from gpt_researcher import GPTResearcher 31 | import asyncio 32 | 33 | 34 | async def get_report(query: str, report_type: str) -> str: 35 | researcher = GPTResearcher(query, report_type) 36 | report = await researcher.run() 37 | return report 38 | 39 | if __name__ == "__main__": 40 | query = "what team may win the NBA finals?" 41 | report_type = "research_report" 42 | 43 | report = asyncio.run(get_report(query, report_type)) 44 | print(report) 45 | 46 | ``` 47 | 48 | 49 | -------------------------------------------------------------------------------- /gpt_researcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .master import GPTResearcher 2 | from .config import Config 3 | 4 | __all__ = ['GPTResearcher', 'Config'] 5 | -------------------------------------------------------------------------------- /gpt_researcher/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | 3 | __all__ = ['Config'] -------------------------------------------------------------------------------- /gpt_researcher/config/config.py: -------------------------------------------------------------------------------- 1 | # config file 2 | import json 3 | import os 4 | 5 | 6 | class Config: 7 | """Config class for GPT Researcher.""" 8 | 9 | def __init__(self, config_file: str = None): 10 | """Initialize the config class.""" 11 | self.config_file = os.path.expanduser(config_file) if config_file else os.getenv('CONFIG_FILE') 12 | self.retriever = os.getenv('RETRIEVER', "tavily") 13 | self.embedding_provider = os.getenv('EMBEDDING_PROVIDER', 'openai') 14 | self.llm_provider = os.getenv('LLM_PROVIDER', "openai") 15 | self.fast_llm_model = os.getenv('FAST_LLM_MODEL', "gpt-3.5-turbo-16k") 16 | self.smart_llm_model = os.getenv('SMART_LLM_MODEL', "gpt-4o") 17 | self.fast_token_limit = int(os.getenv('FAST_TOKEN_LIMIT', 2000)) 18 | self.smart_token_limit = int(os.getenv('SMART_TOKEN_LIMIT', 4000)) 19 | self.browse_chunk_max_length = int(os.getenv('BROWSE_CHUNK_MAX_LENGTH', 8192)) 20 | self.summary_token_limit = int(os.getenv('SUMMARY_TOKEN_LIMIT', 700)) 21 | self.temperature = float(os.getenv('TEMPERATURE', 0.55)) 22 | self.user_agent = os.getenv('USER_AGENT', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " 23 | "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0") 24 | self.max_search_results_per_query = int(os.getenv('MAX_SEARCH_RESULTS_PER_QUERY', 5)) 25 | self.memory_backend = os.getenv('MEMORY_BACKEND', "local") 26 | self.total_words = int(os.getenv('TOTAL_WORDS', 800)) 27 | self.report_format = os.getenv('REPORT_FORMAT', "APA") 28 | self.max_iterations = int(os.getenv('MAX_ITERATIONS', 3)) 29 | self.agent_role = os.getenv('AGENT_ROLE', None) 30 | self.scraper = os.getenv("SCRAPER", "bs") 31 | self.max_subtopics = int(os.getenv("MAX_SUBTOPICS", 3)) 32 | self.doc_path = os.getenv("DOC_PATH", "") 33 | 34 | self.load_config_file() 35 | 36 | if self.doc_path: 37 |
self.validate_doc_path() 38 | 39 | def validate_doc_path(self): 40 | """Ensure that the folder exists at the doc path""" 41 | os.makedirs(self.doc_path, exist_ok=True) 42 | 43 | def load_config_file(self) -> None: 44 | """Load the config file.""" 45 | if self.config_file is None: 46 | return None 47 | with open(self.config_file, "r") as f: 48 | config = json.load(f) 49 | for key, value in config.items(): 50 | setattr(self, key.lower(), value) -------------------------------------------------------------------------------- /gpt_researcher/context/__init__.py: -------------------------------------------------------------------------------- 1 | from .compression import ContextCompressor 2 | from .retriever import SearchAPIRetriever 3 | 4 | __all__ = ['ContextCompressor', 'SearchAPIRetriever'] 5 | -------------------------------------------------------------------------------- /gpt_researcher/context/compression.py: -------------------------------------------------------------------------------- 1 | from .retriever import SearchAPIRetriever 2 | from langchain.retrievers import ( 3 | ContextualCompressionRetriever, 4 | ) 5 | from langchain.retrievers.document_compressors import ( 6 | DocumentCompressorPipeline, 7 | EmbeddingsFilter, 8 | ) 9 | from langchain.text_splitter import RecursiveCharacterTextSplitter 10 | 11 | 12 | class ContextCompressor: 13 | def __init__(self, documents, embeddings, max_results=5, **kwargs): 14 | self.max_results = max_results 15 | self.documents = documents 16 | self.kwargs = kwargs 17 | self.embeddings = embeddings 18 | self.similarity_threshold = 0.38 19 | 20 | def _get_contextual_retriever(self): 21 | splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100) 22 | relevance_filter = EmbeddingsFilter(embeddings=self.embeddings, 23 | similarity_threshold=self.similarity_threshold) 24 | pipeline_compressor = DocumentCompressorPipeline( 25 | transformers=[splitter, relevance_filter] 26 | ) 27 | base_retriever = SearchAPIRetriever( 28 | pages=self.documents 29 | ) 30 | contextual_retriever = ContextualCompressionRetriever( 31 | base_compressor=pipeline_compressor, base_retriever=base_retriever 32 | ) 33 | return contextual_retriever 34 | 35 | def _pretty_print_docs(self, docs, top_n): 36 | return f"\n".join(f"Source: {d.metadata.get('source')}\n" 37 | f"Title: {d.metadata.get('title')}\n" 38 | f"Content: {d.page_content}\n" 39 | for i, d in enumerate(docs) if i < top_n) 40 | 41 | def get_context(self, query, max_results=5): 42 | compressed_docs = self._get_contextual_retriever() 43 | relevant_docs = compressed_docs.invoke(query) 44 | return self._pretty_print_docs(relevant_docs, max_results) -------------------------------------------------------------------------------- /gpt_researcher/context/retriever.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from typing import Any, Dict, List, Optional 4 | 5 | from langchain.callbacks.manager import CallbackManagerForRetrieverRun 6 | from langchain.schema import Document 7 | from langchain.schema.retriever import BaseRetriever 8 | 9 | 10 | class SearchAPIRetriever(BaseRetriever): 11 | """Search API retriever.""" 12 | pages: List[Dict] = [] 13 | 14 | def _get_relevant_documents( 15 | self, query: str, *, run_manager: CallbackManagerForRetrieverRun 16 | ) -> List[Document]: 17 | 18 | docs = [ 19 | Document( 20 | page_content=page.get("raw_content", ""), 21 | metadata={ 22 | "title": page.get("title", ""), 23 | "source": 
page.get("url", ""), 24 | }, 25 | ) 26 | for page in self.pages 27 | ] 28 | 29 | return docs 30 | -------------------------------------------------------------------------------- /gpt_researcher/document/__init__.py: -------------------------------------------------------------------------------- 1 | from .document import DocumentLoader 2 | 3 | __all__ = ['DocumentLoader'] 4 | -------------------------------------------------------------------------------- /gpt_researcher/document/document.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | from langchain_community.document_loaders import ( 5 | PyMuPDFLoader, 6 | TextLoader, 7 | UnstructuredCSVLoader, 8 | UnstructuredExcelLoader, 9 | UnstructuredMarkdownLoader, 10 | UnstructuredPowerPointLoader, 11 | UnstructuredWordDocumentLoader 12 | ) 13 | 14 | 15 | class DocumentLoader: 16 | 17 | def __init__(self, path): 18 | self.path = path 19 | 20 | async def load(self) -> list: 21 | tasks = [] 22 | for root, dirs, files in os.walk(self.path): 23 | for file in files: 24 | file_path = os.path.join(root, file) 25 | file_name, file_extension_with_dot = os.path.splitext(file_path) 26 | file_extension = file_extension_with_dot.strip(".") 27 | tasks.append(self._load_document(file_path, file_extension)) 28 | 29 | docs = [] 30 | for pages in await asyncio.gather(*tasks): 31 | for page in pages: 32 | if page.page_content: 33 | docs.append({ 34 | "raw_content": page.page_content, 35 | "url": os.path.basename(page.metadata['source']) 36 | }) 37 | 38 | if not docs: 39 | raise ValueError("🤷 Failed to load any documents!") 40 | 41 | return docs 42 | 43 | async def _load_document(self, file_path: str, file_extension: str) -> list: 44 | try: 45 | loader_dict = { 46 | "pdf": PyMuPDFLoader(file_path), 47 | "txt": TextLoader(file_path), 48 | "doc": UnstructuredWordDocumentLoader(file_path), 49 | "docx": UnstructuredWordDocumentLoader(file_path), 50 | "pptx": UnstructuredPowerPointLoader(file_path), 51 | "csv": UnstructuredCSVLoader(file_path, mode="elements"), 52 | "xls": UnstructuredExcelLoader(file_path, mode="elements"), 53 | "xlsx": UnstructuredExcelLoader(file_path, mode="elements"), 54 | "md": UnstructuredMarkdownLoader(file_path) 55 | } 56 | 57 | loader = loader_dict.get(file_extension, None) 58 | if loader: 59 | data = loader.load() 60 | return data 61 | 62 | except Exception as e: 63 | print(f"Failed to load document : {file_path}") 64 | print(e) 65 | return [] 66 | -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/__init__.py: -------------------------------------------------------------------------------- 1 | from .google.google import GoogleProvider 2 | from .openai.openai import OpenAIProvider 3 | from .azureopenai.azureopenai import AzureOpenAIProvider 4 | 5 | __all__ = [ 6 | "GoogleProvider", 7 | "OpenAIProvider", 8 | "AzureOpenAIProvider" 9 | ] 10 | -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/azureopenai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/llm_provider/azureopenai/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/azureopenai/azureopenai.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from colorama import Fore, Style 4 | from langchain_openai import AzureChatOpenAI 5 | 6 | ''' 7 | Please note: 8 | Needs additional env vars such as: 9 | AZURE_OPENAI_ENDPOINT e.g. https://xxxx.openai.azure.com/", 10 | AZURE_OPENAI_API_KEY e.g "xxxxxxxxxxxxxxxxxxxxx", 11 | OPENAI_API_VERSION, e.g. "2024-03-01-preview", but needs to be updated over time as the API version changes, 12 | AZURE_EMBEDDING_MODEL e.g. "ada2" The Azure OpenAI embedding model deployment name. 13 | 14 | config.py settings for Azure OpenAI should look like: 15 | self.embedding_provider = os.getenv('EMBEDDING_PROVIDER', 'azureopenai') 16 | self.llm_provider = os.getenv('LLM_PROVIDER', "azureopenai") 17 | self.fast_llm_model = os.getenv('FAST_LLM_MODEL', "gpt-3.5-turbo-16k") #Deployment name of your GPT3.5T model as per azure OpenAI studio deployment section 18 | self.smart_llm_model = os.getenv('SMART_LLM_MODEL', "gpt4") #Deployment name of your GPT4 1106-Preview+ (GPT4T) model as per azure OpenAI studio deployment section 19 | ''' 20 | class AzureOpenAIProvider: 21 | 22 | def __init__( 23 | self, 24 | deployment_name, 25 | temperature, 26 | max_tokens 27 | ): 28 | self.deployment_name = deployment_name 29 | self.temperature = temperature 30 | self.max_tokens = max_tokens 31 | self.api_key = self.get_api_key() 32 | self.llm = self.get_llm_model() 33 | 34 | def get_api_key(self): 35 | """ 36 | Gets the Azure OpenAI API key 37 | Returns: 38 | 39 | """ 40 | try: 41 | api_key = os.environ["AZURE_OPENAI_API_KEY"] 42 | except KeyError: 43 | raise Exception( 44 | "Azure OpenAI API key not found. Please set the AZURE_OPENAI_API_KEY environment variable.") 45 | return api_key 46 | 47 | def get_llm_model(self): 48 | # Initializing the chat model 49 | llm = AzureChatOpenAI( 50 | deployment_name=self.deployment_name, 51 | temperature=self.temperature, 52 | max_tokens=self.max_tokens, 53 | api_key=self.api_key 54 | ) 55 | 56 | return llm 57 | 58 | async def get_chat_response(self, messages, stream, websocket=None): 59 | if not stream: 60 | # Getting output from the model chain using ainvoke for asynchronous invoking 61 | output = await self.llm.ainvoke(messages) 62 | 63 | return output.content 64 | 65 | else: 66 | return await self.stream_response(messages, websocket) 67 | 68 | async def stream_response(self, messages, websocket=None): 69 | paragraph = "" 70 | response = "" 71 | 72 | # Streaming the response using the chain astream method from langchain 73 | async for chunk in self.llm.astream(messages): 74 | content = chunk.content 75 | if content is not None: 76 | response += content 77 | paragraph += content 78 | if "\n" in paragraph: 79 | if websocket is not None: 80 | await websocket.send_json({"type": "report", "output": paragraph}) 81 | else: 82 | print(f"{Fore.GREEN}{paragraph}{Style.RESET_ALL}") 83 | paragraph = "" 84 | 85 | return response 86 | -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/google/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/llm_provider/google/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/google/google.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from colorama import
Fore, Style 4 | from langchain_core.messages import HumanMessage, SystemMessage 5 | from langchain_google_genai import ChatGoogleGenerativeAI 6 | 7 | 8 | class GoogleProvider: 9 | 10 | def __init__( 11 | self, 12 | model, 13 | temperature, 14 | max_tokens 15 | ): 16 | # May be extended to support more google models in the future 17 | self.model = "gemini-pro" 18 | self.temperature = temperature 19 | self.max_tokens = max_tokens 20 | self.api_key = self.get_api_key() 21 | self.llm = self.get_llm_model() 22 | 23 | def get_api_key(self): 24 | """ 25 | Gets the GEMINI_API_KEY 26 | Returns: 27 | 28 | """ 29 | try: 30 | api_key = os.environ["GEMINI_API_KEY"] 31 | except: 32 | raise Exception( 33 | "GEMINI API key not found. Please set the GEMINI_API_KEY environment variable.") 34 | return api_key 35 | 36 | def get_llm_model(self): 37 | # Initializing the chat model 38 | llm = ChatGoogleGenerativeAI( 39 | convert_system_message_to_human=True, 40 | model=self.model, 41 | temperature=self.temperature, 42 | max_output_tokens=self.max_tokens, 43 | google_api_key=self.api_key 44 | ) 45 | 46 | return llm 47 | 48 | def convert_messages(self, messages): 49 | """ 50 | The function `convert_messages` converts messages based on their role into either SystemMessage 51 | or HumanMessage objects. 52 | 53 | Args: 54 | messages: It looks like the code snippet you provided is a function called `convert_messages` 55 | that takes a list of messages as input and converts each message based on its role into either a 56 | `SystemMessage` or a `HumanMessage`. 57 | 58 | Returns: 59 | The `convert_messages` function is returning a list of converted messages based on the input 60 | `messages`. The function checks the role of each message in the input list and creates a new 61 | `SystemMessage` object if the role is "system" or a new `HumanMessage` object if the role is 62 | "user". The function then returns a list of these converted messages. 
63 | """ 64 | converted_messages = [] 65 | for message in messages: 66 | if message["role"] == "system": 67 | converted_messages.append( 68 | SystemMessage(content=message["content"])) 69 | elif message["role"] == "user": 70 | converted_messages.append( 71 | HumanMessage(content=message["content"])) 72 | 73 | return converted_messages 74 | 75 | async def get_chat_response(self, messages, stream, websocket=None): 76 | if not stream: 77 | # Getting output from the model chain using ainvoke for asynchronous invoking 78 | converted_messages = self.convert_messages(messages) 79 | output = await self.llm.ainvoke(converted_messages) 80 | 81 | return output.content 82 | 83 | else: 84 | return await self.stream_response(messages, websocket) 85 | 86 | async def stream_response(self, messages, websocket=None): 87 | paragraph = "" 88 | response = "" 89 | 90 | # Streaming the response using the chain astream method from langchain 91 | async for chunk in self.llm.astream(messages): 92 | content = chunk.content 93 | if content is not None: 94 | response += content 95 | paragraph += content 96 | if "\n" in paragraph: 97 | if websocket is not None: 98 | await websocket.send_json({"type": "report", "output": paragraph}) 99 | else: 100 | print(f"{Fore.GREEN}{paragraph}{Style.RESET_ALL}") 101 | paragraph = "" 102 | 103 | return response 104 | -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/llm_provider/openai/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/llm_provider/openai/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from colorama import Fore, Style 4 | from langchain_openai import ChatOpenAI 5 | 6 | 7 | class OpenAIProvider: 8 | 9 | def __init__( 10 | self, 11 | model, 12 | temperature, 13 | max_tokens 14 | ): 15 | self.model = model 16 | self.temperature = temperature 17 | self.max_tokens = max_tokens 18 | self.api_key = self.get_api_key() 19 | self.base_url = self.get_base_url() 20 | self.llm = self.get_llm_model() 21 | 22 | def get_api_key(self): 23 | """ 24 | Gets the OpenAI API key 25 | Returns: 26 | 27 | """ 28 | try: 29 | api_key = os.environ["OPENAI_API_KEY"] 30 | except KeyError: 31 | raise Exception( 32 | "OpenAI API key not found. 
Please set the OPENAI_API_KEY environment variable.") 33 | return api_key 34 | 35 | def get_base_url(self): 36 | """ 37 | Gets the OpenAI Base URL from the environment variable if defined otherwise use the default one 38 | Returns: 39 | 40 | """ 41 | base_url = os.environ.get("OPENAI_BASE_URL", None) 42 | return base_url 43 | 44 | 45 | def get_llm_model(self): 46 | # Initializing the chat model 47 | llm = ChatOpenAI( 48 | model=self.model, 49 | temperature=self.temperature, 50 | max_tokens=self.max_tokens, 51 | api_key=self.api_key 52 | ) 53 | if self.base_url: 54 | llm.openai_api_base = self.base_url 55 | 56 | return llm 57 | 58 | async def get_chat_response(self, messages, stream, websocket=None): 59 | if not stream: 60 | # Getting output from the model chain using ainvoke for asynchronous invoking 61 | output = await self.llm.ainvoke(messages) 62 | 63 | return output.content 64 | 65 | else: 66 | return await self.stream_response(messages, websocket) 67 | 68 | async def stream_response(self, messages, websocket=None): 69 | paragraph = "" 70 | response = "" 71 | 72 | # Streaming the response using the chain astream method from langchain 73 | async for chunk in self.llm.astream(messages): 74 | content = chunk.content 75 | if content is not None: 76 | response += content 77 | paragraph += content 78 | if "\n" in paragraph: 79 | if websocket is not None: 80 | await websocket.send_json({"type": "report", "output": paragraph}) 81 | else: 82 | print(f"{Fore.GREEN}{paragraph}{Style.RESET_ALL}") 83 | paragraph = "" 84 | 85 | return response 86 | -------------------------------------------------------------------------------- /gpt_researcher/master/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import GPTResearcher 2 | 3 | __all__ = ['GPTResearcher'] -------------------------------------------------------------------------------- /gpt_researcher/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .embeddings import Memory 2 | -------------------------------------------------------------------------------- /gpt_researcher/memory/embeddings.py: -------------------------------------------------------------------------------- 1 | from langchain_community.vectorstores import FAISS 2 | import os 3 | 4 | 5 | class Memory: 6 | def __init__(self, embedding_provider, **kwargs): 7 | 8 | _embeddings = None 9 | match embedding_provider: 10 | case "ollama": 11 | from langchain.embeddings import OllamaEmbeddings 12 | _embeddings = OllamaEmbeddings(model="llama2") 13 | case "openai": 14 | from langchain_openai import OpenAIEmbeddings 15 | _embeddings = OpenAIEmbeddings(model="text-embedding-3-small") 16 | case "azureopenai": 17 | from langchain_openai import AzureOpenAIEmbeddings 18 | _embeddings = AzureOpenAIEmbeddings(deployment=os.environ["AZURE_EMBEDDING_MODEL"], chunk_size=16) 19 | case "huggingface": 20 | from langchain.embeddings import HuggingFaceEmbeddings 21 | _embeddings = HuggingFaceEmbeddings() 22 | 23 | case _: 24 | raise Exception("Embedding provider not found.") 25 | 26 | self._embeddings = _embeddings 27 | 28 | def get_embeddings(self): 29 | return self._embeddings 30 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/__init__.py: -------------------------------------------------------------------------------- 1 | from .tavily_search.tavily_search import TavilySearch 2 | from .duckduckgo.duckduckgo import Duckduckgo 3 | 
from .google.google import GoogleSearch 4 | from .serper.serper import SerperSearch 5 | from .serpapi.serpapi import SerpApiSearch 6 | from .searx.searx import SearxSearch 7 | from .bing.bing import BingSearch 8 | from .yahoo.yahoo import YahooSearch 9 | 10 | __all__ = [ 11 | "TavilySearch", 12 | "Duckduckgo", 13 | "SerperSearch", 14 | "SerpApiSearch", 15 | "GoogleSearch", 16 | "SearxSearch", 17 | "BingSearch", 18 | "YahooSearch" 19 | ] 20 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/bing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/bing/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/bing/bing.py: -------------------------------------------------------------------------------- 1 | # Bing Search Retriever 2 | 3 | # libraries 4 | import os 5 | import requests 6 | import json 7 | 8 | 9 | class BingSearch(): 10 | """ 11 | Bing Search Retriever 12 | """ 13 | def __init__(self, query): 14 | """ 15 | Initializes the BingSearch object 16 | Args: 17 | query: 18 | """ 19 | self.query = query 20 | self.api_key = self.get_api_key() 21 | 22 | def get_api_key(self): 23 | """ 24 | Gets the Bing API key 25 | Returns: 26 | 27 | """ 28 | try: 29 | api_key = os.environ["BING_API_KEY"] 30 | except: 31 | raise Exception("Bing API key not found. Please set the BING_API_KEY environment variable.") 32 | return api_key 33 | 34 | def search(self, max_results=7): 35 | """ 36 | Searches the query 37 | Returns: 38 | 39 | """ 40 | print("Searching with query {0}...".format(self.query)) 41 | """Useful for general internet search queries using the Bing API.""" 42 | 43 | 44 | # Search the query 45 | url = "https://api.bing.microsoft.com/v7.0/search" 46 | 47 | headers = { 48 | 'Ocp-Apim-Subscription-Key': self.api_key, 49 | 'Content-Type': 'application/json' 50 | } 51 | params = { 52 | "responseFilter" : "Webpages", 53 | "q": self.query, 54 | "count": max_results, 55 | "setLang": "en-GB", 56 | "textDecorations": False, 57 | "textFormat": "HTML", 58 | "safeSearch": "Strict" 59 | } 60 | 61 | resp = requests.get(url, headers=headers, params=params) 62 | 63 | # Preprocess the results 64 | if resp is None: 65 | return 66 | try: 67 | search_results = json.loads(resp.text) 68 | except Exception: 69 | return 70 | if search_results is None: 71 | return 72 | 73 | results = search_results["webPages"]["value"] 74 | search_results = [] 75 | 76 | # Normalize the results to match the format of the other search APIs 77 | for result in results: 78 | # skip youtube results 79 | if "youtube.com" in result["url"]: 80 | continue 81 | search_result = { 82 | "title": result["name"], 83 | "href": result["url"], 84 | "body": result["snippet"], 85 | } 86 | search_results.append(search_result) 87 | 88 | return search_results 89 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/duckduckgo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/duckduckgo/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/duckduckgo/duckduckgo.py: 
-------------------------------------------------------------------------------- 1 | from itertools import islice 2 | from duckduckgo_search import DDGS 3 | 4 | 5 | class Duckduckgo: 6 | """ 7 | Duckduckgo API Retriever 8 | """ 9 | def __init__(self, query): 10 | self.ddg = DDGS() 11 | self.query = query 12 | 13 | def search(self, max_results=5): 14 | """ 15 | Performs the search 16 | :param query: 17 | :param max_results: 18 | :return: 19 | """ 20 | ddgs_gen = self.ddg.text(self.query, region='wt-wt', max_results=max_results) 21 | return ddgs_gen -------------------------------------------------------------------------------- /gpt_researcher/retrievers/google/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/google/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/google/google.py: -------------------------------------------------------------------------------- 1 | # Google Custom Search API Retriever 2 | 3 | # libraries 4 | import os 5 | import requests 6 | import json 7 | from tavily import TavilyClient 8 | 9 | 10 | class GoogleSearch: 11 | """ 12 | Google Custom Search API Retriever 13 | """ 14 | def __init__(self, query): 15 | """ 16 | Initializes the GoogleSearch object 17 | Args: 18 | query: 19 | """ 20 | self.query = query 21 | self.api_key = self.get_api_key() #GOOGLE_API_KEY 22 | self.cx_key = self.get_cx_key() #GOOGLE_CX_KEY 23 | self.client = TavilyClient(self.api_key) 24 | 25 | def get_api_key(self): 26 | """ 27 | Gets the Google API key 28 | Returns: 29 | 30 | """ 31 | # Get the API key 32 | try: 33 | api_key = os.environ["GOOGLE_API_KEY"] 34 | except KeyError: 35 | raise Exception("Google API key not found. Please set the GOOGLE_API_KEY environment variable. " 36 | "You can get a key at https://developers.google.com/custom-search/v1/overview") 37 | return api_key 38 | 39 | def get_cx_key(self): 40 | """ 41 | Gets the Google CX key 42 | Returns: 43 | 44 | """ 45 | # Get the API key 46 | try: 47 | api_key = os.environ["GOOGLE_CX_KEY"] 48 | except KeyError: 49 | raise Exception("Google CX key not found. Please set the GOOGLE_CX_KEY environment variable.
" 50 | "You can get a key at https://developers.google.com/custom-search/v1/overview") 51 | return api_key 52 | 53 | def search(self, max_results=7): 54 | """ 55 | Searches the query 56 | Returns: 57 | 58 | """ 59 | """Useful for general internet search queries using the Google API.""" 60 | print("Searching with query {0}...".format(self.query)) 61 | url = f"https://www.googleapis.com/customsearch/v1?key={self.api_key}&cx={self.cx_key}&q={self.query}&start=1" 62 | resp = requests.get(url) 63 | 64 | if resp is None: 65 | return 66 | try: 67 | search_results = json.loads(resp.text) 68 | except Exception: 69 | return 70 | if search_results is None: 71 | return 72 | 73 | results = search_results.get("items", []) 74 | search_results = [] 75 | 76 | # Normalizing results to match the format of the other search APIs 77 | for result in results: 78 | # skip youtube results 79 | if "youtube.com" in result["link"]: 80 | continue 81 | search_result = { 82 | "title": result["title"], 83 | "href": result["link"], 84 | "body": result["snippet"], 85 | } 86 | search_results.append(search_result) 87 | 88 | return search_results 89 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/searx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/searx/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/searx/searx.py: -------------------------------------------------------------------------------- 1 | # Tavily API Retriever 2 | 3 | # libraries 4 | import os 5 | from tavily import TavilyClient 6 | from langchain_community.utilities import SearxSearchWrapper 7 | 8 | 9 | class SearxSearch(): 10 | """ 11 | Tavily API Retriever 12 | """ 13 | def __init__(self, query): 14 | """ 15 | Initializes the TavilySearch object 16 | Args: 17 | query: 18 | """ 19 | self.query = query 20 | self.api_key = self.get_api_key() 21 | self.client = TavilyClient(self.api_key) 22 | 23 | def get_api_key(self): 24 | """ 25 | Gets the Tavily API key 26 | Returns: 27 | 28 | """ 29 | # Get the API key 30 | try: 31 | api_key = os.environ["SEARX_URL"] 32 | except: 33 | raise Exception("Searx URL key not found. Please set the SEARX_URL environment variable. 
" 34 | "You can get your key from https://searx.space/") 35 | return api_key 36 | 37 | def search(self, max_results=7): 38 | """ 39 | Searches the query 40 | Returns: 41 | 42 | """ 43 | searx = SearxSearchWrapper(searx_host=os.environ["SEARX_URL"]) 44 | results = searx.results(self.query, max_results) 45 | # Normalizing results to match the format of the other search APIs 46 | search_response = [{"href": obj["link"], "body": obj["snippet"]} for obj in results] 47 | return search_response 48 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/serpapi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/serpapi/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/serpapi/serpapi.py: -------------------------------------------------------------------------------- 1 | # SerpApi Retriever 2 | 3 | # libraries 4 | import os 5 | import requests 6 | from duckduckgo_search import DDGS 7 | import urllib.parse 8 | 9 | 10 | class SerpApiSearch(): 11 | """ 12 | SerpApi Retriever 13 | """ 14 | def __init__(self, query): 15 | """ 16 | Initializes the SerpApiSearch object 17 | Args: 18 | query: 19 | """ 20 | self.query = query 21 | self.api_key = self.get_api_key() 22 | 23 | def get_api_key(self): 24 | """ 25 | Gets the SerpApi API key 26 | Returns: 27 | 28 | """ 29 | try: 30 | api_key = os.environ["SERPAPI_API_KEY"] 31 | except: 32 | raise Exception("SerpApi API key not found. Please set the SERPAPI_API_KEY environment variable. " 33 | "You can get a key at https://serpapi.com/") 34 | return api_key 35 | 36 | def search(self, max_results=7): 37 | """ 38 | Searches the query 39 | Returns: 40 | 41 | """ 42 | print("SerpApiSearch: Searching with query {0}...".format(self.query)) 43 | """Useful for general internet search queries using SerpApi.""" 44 | 45 | 46 | url = "https://serpapi.com/search.json" 47 | params = { 48 | "q": self.query, 49 | "api_key": self.api_key 50 | } 51 | encoded_url = url + "?" 
+ urllib.parse.urlencode(params) 52 | search_response = [] 53 | results_processed = 0 54 | try: 55 | response = requests.get(encoded_url, timeout=10) 56 | if response.status_code == 200: 57 | search_results = response.json() 58 | if search_results: 59 | results = search_results["organic_results"] 60 | for result in results: 61 | # skip youtube results 62 | if "youtube.com" in result["link"]: 63 | continue 64 | if results_processed >= max_results: 65 | break 66 | search_result = { 67 | "title": result["title"], 68 | "href": result["link"], 69 | "body": result["snippet"], 70 | } 71 | search_response.append(search_result) 72 | results_processed += 1 73 | except Exception as e: # Fallback to DuckDuckGo in case of an error with SerpApi 74 | print(f"Error: {e}") 75 | ddg = DDGS() 76 | search_response = ddg.text(self.query, region='wt-wt', max_results=max_results) 77 | 78 | return search_response 79 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/serper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/serper/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/serper/serper.py: -------------------------------------------------------------------------------- 1 | # Google Serper Retriever 2 | 3 | # libraries 4 | import os 5 | import requests 6 | import json 7 | 8 | 9 | class SerperSearch(): 10 | """ 11 | Google Serper Retriever 12 | """ 13 | def __init__(self, query): 14 | """ 15 | Initializes the SerperSearch object 16 | Args: 17 | query: 18 | """ 19 | self.query = query 20 | self.api_key = self.get_api_key() 21 | 22 | def get_api_key(self): 23 | """ 24 | Gets the Serper API key 25 | Returns: 26 | 27 | """ 28 | try: 29 | api_key = os.environ["SERPER_API_KEY"] 30 | except: 31 | raise Exception("Serper API key not found. Please set the SERPER_API_KEY environment variable.
" 32 | "You can get a key at https://serper.dev/") 33 | return api_key 34 | 35 | def search(self, max_results=7): 36 | """ 37 | Searches the query 38 | Returns: 39 | 40 | """ 41 | print("Searching with query {0}...".format(self.query)) 42 | """Useful for general internet search queries using the Serp API.""" 43 | 44 | 45 | # Search the query (see https://serper.dev/playground for the format) 46 | url = "https://google.serper.dev/search" 47 | 48 | headers = { 49 | 'X-API-KEY': self.api_key, 50 | 'Content-Type': 'application/json' 51 | } 52 | data = json.dumps({"q": self.query, "num": max_results}) 53 | 54 | resp = requests.request("POST", url, timeout=10, headers=headers, data=data) 55 | 56 | # Preprocess the results 57 | if resp is None: 58 | return 59 | try: 60 | search_results = json.loads(resp.text) 61 | except Exception: 62 | return 63 | if search_results is None: 64 | return 65 | 66 | results = search_results["organic"] 67 | search_results = [] 68 | 69 | # Normalize the results to match the format of the other search APIs 70 | for result in results: 71 | # skip youtube results 72 | if "youtube.com" in result["link"]: 73 | continue 74 | search_result = { 75 | "title": result["title"], 76 | "href": result["link"], 77 | "body": result["snippet"], 78 | } 79 | search_results.append(search_result) 80 | 81 | return search_results 82 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/tavily_search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/tavily_search/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/tavily_search/tavily_search.py: -------------------------------------------------------------------------------- 1 | # Tavily API Retriever 2 | 3 | # libraries 4 | import os 5 | from tavily import TavilyClient 6 | from duckduckgo_search import DDGS 7 | from yahoo_search import search 8 | 9 | 10 | class TavilySearch(): 11 | """ 12 | Tavily API Retriever 13 | """ 14 | def __init__(self, query, topic="general"): 15 | """ 16 | Initializes the TavilySearch object 17 | Args: 18 | query: 19 | """ 20 | self.query = query 21 | self.api_key = self.get_api_key() 22 | self.client = TavilyClient(self.api_key) 23 | self.topic = topic 24 | 25 | def get_api_key(self): 26 | """ 27 | Gets the Tavily API key 28 | Returns: 29 | 30 | """ 31 | # Get the API key 32 | try: 33 | api_key = os.environ["TAVILY_API_KEY"] 34 | except: 35 | raise Exception("Tavily API key not found. Please set the TAVILY_API_KEY environment variable. " 36 | "You can get a key at https://app.tavily.com") 37 | return api_key 38 | 39 | def search(self, max_results=7): 40 | """ 41 | Searches the query 42 | Returns: 43 | 44 | """ 45 | try: 46 | # Search the query 47 | results = self.client.search(self.query, search_depth="basic", max_results=max_results, topic=self.topic) 48 | sources = results.get("results", []) 49 | if not sources: 50 | raise Exception("No results found with Tavily API search.") 51 | # Return the results 52 | search_response = [{"href": obj["url"], "body": obj["content"]} for obj in sources] 53 | except Exception as e: # Fallback in case overload on Tavily Search API 54 | print(f"Error: {e}. 
Fallback to DuckDuckGo Search API...") 55 | try: 56 | ddg = DDGS() 57 | search_response = ddg.text(self.query, region='wt-wt', max_results=max_results) 58 | except Exception as e: 59 | print(f"Error: {e}. Fallback to Yahoo Search API...") 60 | search_response = [{"href": obj.link, "body": obj.text, "title": obj.title} for obj in search(self.query).pages] 61 | return search_response 62 | -------------------------------------------------------------------------------- /gpt_researcher/retrievers/yahoo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/retrievers/yahoo/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/retrievers/yahoo/yahoo.py: -------------------------------------------------------------------------------- 1 | # Yahoo Search Retriever 2 | 3 | # libraries 4 | import os 5 | from yahoo_search import search 6 | 7 | 8 | class YahooSearch: 9 | """ 10 | Yahoo Search Retriever 11 | """ 12 | def __init__(self, query, topic="general"): 13 | """ 14 | Initializes the YahooSearch object 15 | Args: 16 | query: 17 | """ 18 | self.query = query 19 | 20 | def get_api_key(self): 21 | """ 22 | Gets the API key (none is required for this library) 23 | Returns: 24 | 25 | """ 26 | return "No API Key is required for this library" 27 | 28 | def search(self, max_results=7): 29 | """ 30 | Searches the query 31 | Returns: 32 | 33 | """ 34 | try: 35 | # Search the query 36 | results = search(self.query) 37 | sources = results.pages 38 | if not sources: 39 | raise Exception("No results found with Yahoo search.") 40 | # Return the results 41 | search_response = [{"href": obj.link, "body": obj.text, "title": obj.title} for obj in sources] 42 | except Exception as e: # Return an empty list if the Yahoo search fails 43 | print(f"Error: {e}") 44 | search_response = [] 45 | return search_response 46 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .beautiful_soup.beautiful_soup import BeautifulSoupScraper 3 | from .newspaper.newspaper import NewspaperScraper 4 | from .web_base_loader.web_base_loader import WebBaseLoaderScraper 5 | from .arxiv.arxiv import ArxivScraper 6 | from .pymupdf.pymupdf import PyMuPDFScraper 7 | 8 | __all__ = [ 9 | "BeautifulSoupScraper", 10 | "NewspaperScraper", 11 | "WebBaseLoaderScraper", 12 | "ArxivScraper", 13 | "PyMuPDFScraper" 14 | ] -------------------------------------------------------------------------------- /gpt_researcher/scraper/arxiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/scraper/arxiv/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/scraper/arxiv/arxiv.py: -------------------------------------------------------------------------------- 1 | from langchain_community.retrievers import ArxivRetriever 2 | 3 | 4 | class ArxivScraper: 5 | 6 | def __init__(self, link, session=None): 7 | self.link = link 8 | self.session = session 9 | 10 | def scrape(self): 11 | """ 12 | The function scrapes relevant documents from Arxiv based on a given link and returns the content 13 | of the first document.
14 | 15 | Returns: 16 | The code is returning the page content of the first document retrieved by the ArxivRetriever 17 | for a given query extracted from the link. 18 | """ 19 | query = self.link.split("/")[-1] 20 | retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) 21 | docs = retriever.get_relevant_documents(query=query) 22 | return docs[0].page_content 23 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/beautiful_soup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/scraper/beautiful_soup/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/scraper/beautiful_soup/beautiful_soup.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | 4 | class BeautifulSoupScraper: 5 | 6 | def __init__(self, link, session=None): 7 | self.link = link 8 | self.session = session 9 | 10 | def scrape(self): 11 | """ 12 | This function scrapes content from a webpage by making a GET request, parsing the HTML using 13 | BeautifulSoup, and extracting script and style elements before returning the cleaned content. 14 | 15 | Returns: 16 | The `scrape` method is returning the cleaned and extracted content from the webpage specified 17 | by the `self.link` attribute. The method fetches the webpage content, removes script and style 18 | tags, extracts the text content, and returns the cleaned content as a string. If any exception 19 | occurs during the process, an error message is printed and an empty string is returned. 20 | """ 21 | try: 22 | response = self.session.get(self.link, timeout=4) 23 | soup = BeautifulSoup( 24 | response.content, "lxml", from_encoding=response.encoding 25 | ) 26 | 27 | for script_or_style in soup(["script", "style"]): 28 | script_or_style.extract() 29 | 30 | raw_content = self.get_content_from_url(soup) 31 | lines = (line.strip() for line in raw_content.splitlines()) 32 | chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 33 | content = "\n".join(chunk for chunk in chunks if chunk) 34 | return content 35 | 36 | except Exception as e: 37 | print("Error! : " + str(e)) 38 | return "" 39 | 40 | def get_content_from_url(self, soup): 41 | """Get the text from the soup 42 | 43 | Args: 44 | soup (BeautifulSoup): The soup to get the text from 45 | 46 | Returns: 47 | str: The text from the soup 48 | """ 49 | text = "" 50 | tags = ["p", "h1", "h2", "h3", "h4", "h5"] 51 | for element in soup.find_all(tags): # Find all the

elements 52 | text += element.text + "\n" 53 | return text 54 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/newspaper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/scraper/newspaper/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/scraper/newspaper/newspaper.py: -------------------------------------------------------------------------------- 1 | from newspaper import Article 2 | 3 | 4 | class NewspaperScraper: 5 | 6 | def __init__(self, link, session=None): 7 | self.link = link 8 | self.session = session 9 | 10 | def scrape(self) -> str: 11 | """ 12 | This Python function scrapes an article from a given link, extracts the title and text content, 13 | and returns them concatenated with a colon. 14 | 15 | Returns: 16 | The `scrape` method returns a string that contains the title of the article followed by a 17 | colon and the text of the article. If the title or text is not present, an empty string is 18 | returned. If an exception occurs during the scraping process, an error message is printed and an 19 | empty string is returned. 20 | """ 21 | try: 22 | article = Article( 23 | self.link, 24 | language="en", 25 | memoize_articles=False, 26 | fetch_images=False, 27 | ) 28 | article.download() 29 | article.parse() 30 | 31 | title = article.title 32 | text = article.text 33 | 34 | # If title, summary are not present then return None 35 | if not (title and text): 36 | return "" 37 | 38 | return f"{title} : {text}" 39 | 40 | except Exception as e: 41 | print("Error! : " + str(e)) 42 | return "" 43 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/pymupdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/scraper/pymupdf/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/scraper/pymupdf/pymupdf.py: -------------------------------------------------------------------------------- 1 | from langchain_community.document_loaders import PyMuPDFLoader 2 | 3 | 4 | class PyMuPDFScraper: 5 | 6 | def __init__(self, link, session=None): 7 | self.link = link 8 | self.session = session 9 | 10 | def scrape(self) -> str: 11 | """ 12 | The `scrape` function uses PyMuPDFLoader to load a document from a given link and returns it as 13 | a string. 14 | 15 | Returns: 16 | The `scrape` method is returning a string representation of the `doc` object, which is loaded 17 | using PyMuPDFLoader from the provided link. 
18 | """ 19 | loader = PyMuPDFLoader(self.link) 20 | doc = loader.load() 21 | return str(doc) 22 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/scraper.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures.thread import ThreadPoolExecutor 2 | from functools import partial 3 | 4 | import requests 5 | 6 | from gpt_researcher.scraper import ( 7 | ArxivScraper, 8 | BeautifulSoupScraper, 9 | NewspaperScraper, 10 | PyMuPDFScraper, 11 | WebBaseLoaderScraper, 12 | ) 13 | 14 | 15 | class Scraper: 16 | """ 17 | Scraper class to extract the content from the links 18 | """ 19 | 20 | def __init__(self, urls, user_agent, scraper): 21 | """ 22 | Initialize the Scraper class. 23 | Args: 24 | urls: 25 | """ 26 | self.urls = urls 27 | self.session = requests.Session() 28 | self.session.headers.update({"User-Agent": user_agent}) 29 | self.scraper = scraper 30 | 31 | def run(self): 32 | """ 33 | Extracts the content from the links 34 | """ 35 | partial_extract = partial(self.extract_data_from_link, session=self.session) 36 | with ThreadPoolExecutor(max_workers=20) as executor: 37 | contents = executor.map(partial_extract, self.urls) 38 | res = [content for content in contents if content["raw_content"] is not None] 39 | return res 40 | 41 | def extract_data_from_link(self, link, session): 42 | """ 43 | Extracts the data from the link 44 | """ 45 | content = "" 46 | try: 47 | Scraper = self.get_scraper(link) 48 | scraper = Scraper(link, session) 49 | content = scraper.scrape() 50 | 51 | if len(content) < 100: 52 | return {"url": link, "raw_content": None} 53 | return {"url": link, "raw_content": content} 54 | except Exception as e: 55 | return {"url": link, "raw_content": None} 56 | 57 | def get_scraper(self, link): 58 | """ 59 | The function `get_scraper` determines the appropriate scraper class based on the provided link 60 | or a default scraper if none matches. 61 | 62 | Args: 63 | link: The `get_scraper` method takes a `link` parameter which is a URL link to a webpage or a 64 | PDF file. Based on the type of content the link points to, the method determines the appropriate 65 | scraper class to use for extracting data from that content. 66 | 67 | Returns: 68 | The `get_scraper` method returns the scraper class based on the provided link. The method 69 | checks the link to determine the appropriate scraper class to use based on predefined mappings 70 | in the `SCRAPER_CLASSES` dictionary. If the link ends with ".pdf", it selects the 71 | `PyMuPDFScraper` class. 
If the link contains "arxiv.org", it selects the `ArxivScraper 72 | """ 73 | 74 | SCRAPER_CLASSES = { 75 | "pdf": PyMuPDFScraper, 76 | "arxiv": ArxivScraper, 77 | "newspaper": NewspaperScraper, 78 | "bs": BeautifulSoupScraper, 79 | "web_base_loader": WebBaseLoaderScraper, 80 | } 81 | 82 | scraper_key = None 83 | 84 | if link.endswith(".pdf"): 85 | scraper_key = "pdf" 86 | elif "arxiv.org" in link: 87 | scraper_key = "arxiv" 88 | else: 89 | scraper_key = self.scraper 90 | 91 | scraper_class = SCRAPER_CLASSES.get(scraper_key) 92 | if scraper_class is None: 93 | raise Exception("Scraper not found.") 94 | 95 | return scraper_class 96 | -------------------------------------------------------------------------------- /gpt_researcher/scraper/web_base_loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/scraper/web_base_loader/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/scraper/web_base_loader/web_base_loader.py: -------------------------------------------------------------------------------- 1 | from langchain_community.document_loaders import WebBaseLoader 2 | 3 | 4 | class WebBaseLoaderScraper: 5 | 6 | def __init__(self, link, session=None): 7 | self.link = link 8 | self.session = session 9 | 10 | def scrape(self) -> str: 11 | """ 12 | This Python function scrapes content from a webpage using a WebBaseLoader object and returns the 13 | concatenated page content. 14 | 15 | Returns: 16 | The `scrape` method is returning a string variable named `content` which contains the 17 | concatenated page content from the documents loaded by the `WebBaseLoader`. If an exception 18 | occurs during the process, an error message is printed and an empty string is returned. 19 | """ 20 | try: 21 | loader = WebBaseLoader(self.link) 22 | loader.requests_kwargs = {"verify": False} 23 | docs = loader.load() 24 | content = "" 25 | 26 | for doc in docs: 27 | content += doc.page_content 28 | 29 | return content 30 | 31 | except Exception as e: 32 | print("Error! 
: " + str(e)) 33 | return "" 34 | -------------------------------------------------------------------------------- /gpt_researcher/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/gpt_researcher/utils/__init__.py -------------------------------------------------------------------------------- /gpt_researcher/utils/enum.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | class ReportType(Enum): 3 | ResearchReport = 'research_report' 4 | ResourceReport = 'resource_report' 5 | OutlineReport = 'outline_report' 6 | CustomReport = 'custom_report' 7 | DetailedReport = 'detailed_report' 8 | SubtopicReport = 'subtopic_report' 9 | 10 | class ReportSource(Enum): 11 | Web = 'web' 12 | Local = 'local' 13 | -------------------------------------------------------------------------------- /gpt_researcher/utils/llm.py: -------------------------------------------------------------------------------- 1 | # libraries 2 | from __future__ import annotations 3 | 4 | import json 5 | import logging 6 | from typing import Optional 7 | 8 | from colorama import Fore, Style 9 | from fastapi import WebSocket 10 | from langchain.output_parsers import PydanticOutputParser 11 | from langchain.prompts import PromptTemplate 12 | from langchain_openai import ChatOpenAI 13 | 14 | from gpt_researcher.master.prompts import auto_agent_instructions, generate_subtopics_prompt 15 | 16 | from .validators import Subtopics 17 | 18 | 19 | def get_provider(llm_provider): 20 | match llm_provider: 21 | case "openai": 22 | from ..llm_provider import OpenAIProvider 23 | llm_provider = OpenAIProvider 24 | case "azureopenai": 25 | from ..llm_provider import AzureOpenAIProvider 26 | llm_provider = AzureOpenAIProvider 27 | case "google": 28 | from ..llm_provider import GoogleProvider 29 | llm_provider = GoogleProvider 30 | 31 | case _: 32 | raise Exception("LLM provider not found.") 33 | 34 | return llm_provider 35 | 36 | 37 | async def create_chat_completion( 38 | messages: list, # type: ignore 39 | model: Optional[str] = None, 40 | temperature: float = 1.0, 41 | max_tokens: Optional[int] = None, 42 | llm_provider: Optional[str] = None, 43 | stream: Optional[bool] = False, 44 | websocket: WebSocket | None = None, 45 | ) -> str: 46 | """Create a chat completion using the OpenAI API 47 | Args: 48 | messages (list[dict[str, str]]): The messages to send to the chat completion 49 | model (str, optional): The model to use. Defaults to None. 50 | temperature (float, optional): The temperature to use. Defaults to 0.9. 51 | max_tokens (int, optional): The max tokens to use. Defaults to None. 52 | stream (bool, optional): Whether to stream the response. Defaults to False. 53 | llm_provider (str, optional): The LLM Provider to use. 
54 | websocket (WebSocket): The websocket used in the current request 55 | Returns: 56 | str: The response from the chat completion 57 | """ 58 | 59 | # validate input 60 | if model is None: 61 | raise ValueError("Model cannot be None") 62 | if max_tokens is not None and max_tokens > 8001: 63 | raise ValueError( 64 | f"Max tokens cannot be more than 8001, but got {max_tokens}") 65 | 66 | # Get the provider from supported providers 67 | ProviderClass = get_provider(llm_provider) 68 | provider = ProviderClass( 69 | model, 70 | temperature, 71 | max_tokens 72 | ) 73 | 74 | # create response 75 | for _ in range(10): # maximum of 10 attempts 76 | response = await provider.get_chat_response( 77 | messages, stream, websocket 78 | ) 79 | return response 80 | 81 | logging.error("Failed to get response from OpenAI API") 82 | raise RuntimeError("Failed to get response from OpenAI API") 83 | 84 | 85 | def choose_agent(smart_llm_model: str, llm_provider: str, task: str) -> dict: 86 | """Determines what server should be used 87 | Args: 88 | task (str): The research question the user asked 89 | smart_llm_model (str): the llm model to be used 90 | llm_provider (str): the llm provider used 91 | Returns: 92 | server - The server that will be used 93 | agent_role_prompt (str): The prompt for the server 94 | """ 95 | try: 96 | response = create_chat_completion( 97 | model=smart_llm_model, 98 | messages=[ 99 | {"role": "system", "content": f"{auto_agent_instructions()}"}, 100 | {"role": "user", "content": f"task: {task}"}], 101 | temperature=0, 102 | llm_provider=llm_provider 103 | ) 104 | agent_dict = json.loads(response) 105 | print(f"Agent: {agent_dict.get('server')}") 106 | return agent_dict 107 | except Exception as e: 108 | print(f"{Fore.RED}Error in choose_agent: {e}{Style.RESET_ALL}") 109 | return {"server": "Default Agent", 110 | "agent_role_prompt": "You are an AI critical thinker research assistant.
Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text."} 111 | 112 | 113 | async def construct_subtopics(task: str, data: str, config, subtopics: list = []) -> list: 114 | try: 115 | parser = PydanticOutputParser(pydantic_object=Subtopics) 116 | 117 | prompt = PromptTemplate( 118 | template=generate_subtopics_prompt(), 119 | input_variables=["task", "data", "subtopics", "max_subtopics"], 120 | partial_variables={ 121 | "format_instructions": parser.get_format_instructions()}, 122 | ) 123 | 124 | print(f"\n🤖 Calling {config.smart_llm_model}...\n") 125 | 126 | if config.llm_provider == "openai": 127 | model = ChatOpenAI(model=config.smart_llm_model) 128 | elif config.llm_provider == "azureopenai": 129 | from langchain_openai import AzureChatOpenAI 130 | model = AzureChatOpenAI(model=config.smart_llm_model) 131 | else: 132 | return [] 133 | 134 | chain = prompt | model | parser 135 | 136 | output = chain.invoke({ 137 | "task": task, 138 | "data": data, 139 | "subtopics": subtopics, 140 | "max_subtopics": config.max_subtopics 141 | }) 142 | 143 | return output 144 | 145 | except Exception as e: 146 | print("Exception in parsing subtopics : ", e) 147 | return subtopics -------------------------------------------------------------------------------- /gpt_researcher/utils/validators.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | class Subtopic(BaseModel): 6 | task: str = Field(description="Task name", min_length=1) 7 | 8 | class Subtopics(BaseModel): 9 | subtopics: List[Subtopic] = [] 10 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from backend.server import app 2 | from dotenv import load_dotenv 3 | load_dotenv() 4 | 5 | if __name__ == "__main__": 6 | import uvicorn 7 | 8 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /multi_agents/README.md: -------------------------------------------------------------------------------- 1 | # LangGraph x GPT Researcher 2 | [LangGraph](https://python.langchain.com/docs/langgraph) is a library for building stateful, multi-actor applications with LLMs. 3 | This example uses Langgraph to automate the process of an in depth research on any given topic. 4 | 5 | ## Use case 6 | By using Langgraph, the research process can be significantly improved in depth and quality by leveraging multiple agents with specialized skills. 7 | Inspired by the recent [STORM](https://arxiv.org/abs/2402.14207) paper, this example showcases how a team of AI agents can work together to conduct research on a given topic, from planning to publication. 8 | 9 | An average run generates a 5-6 page research report in multiple formats such as PDF, Docx and Markdown. 10 | 11 | ## The Multi Agent Team 12 | The research team is made up of 7 AI agents: 13 | - **Chief Editor** - Oversees the research process and manages the team. This is the "master" agent that coordinates the other agents using Langgraph. 14 | - **Researcher** (gpt-researcher) - A specialized autonomous agent that conducts in depth research on a given topic. 15 | - **Editor** - Responsible for planning the research outline and structure. 16 | - **Reviewer** - Validates the correctness of the research results given a set of criteria. 
17 | - **Revisor** - Revises the research results based on the feedback from the reviewer. 18 | - **Writer** - Responsible for compiling and writing the final report. 19 | - **Publisher** - Responsible for publishing the final report in various formats. 20 | 21 | ## How it works 22 | Generally, the process is based on the following stages: 23 | 1. Planning stage 24 | 2. Data collection and analysis 25 | 3. Review and revision 26 | 4. Writing and submission 27 | 5. Publication 28 | 29 | ### Architecture 30 |

31 | *(Architecture diagram: the multi-agent workflow runs from initial research and planning through parallel research, review and revision, to writing and publication.)* 32 | 33 |
34 | 35 | ### Steps 36 | More specifically (as seen in the architecture diagram) the process is as follows: 37 | - Browser (gpt-researcher) - Browses the internet for initial research based on the given research task. 38 | - Editor - Plans the report outline and structure based on the initial research. 39 | - For each outline topic (in parallel): 40 | - Researcher (gpt-researcher) - Runs in-depth research on the subtopics and writes a draft. 41 | - Reviewer - Validates the correctness of the draft given a set of criteria and provides feedback. 42 | - Revisor - Revises the draft until it is satisfactory based on the reviewer feedback. 43 | - Writer - Compiles and writes the final report including an introduction, conclusion and references section from the given research findings. 44 | - Publisher - Publishes the final report in multiple formats such as PDF, Docx, Markdown, etc. 45 | 46 | ## How to run 47 | 1. Install required packages: 48 | ```bash 49 | pip install -r requirements.txt 50 | ``` 51 | 2. Update env variables: 52 | ```bash 53 | export OPENAI_API_KEY={Your OpenAI API Key here} 54 | export TAVILY_API_KEY={Your Tavily API Key here} 55 | ``` 56 | 3. Run the application: 57 | ```bash 58 | python main.py 59 | ``` 60 | 61 | ## Usage 62 | To change the research query and customize the report, edit the `task.json` file in the main directory. 63 | #### Task.json contains the following fields: 64 | - `query` - The research query or task. 65 | - `model` - The OpenAI LLM to use for the agents. 66 | - `max_sections` - The maximum number of sections in the report. Each section is a subtopic of the research query. 67 | - `publish_formats` - The formats to publish the report in. The reports will be written in the `output` directory. 68 | - `follow_guidelines` - If true, the research report will follow the guidelines below. It will take longer to complete. If false, the report will be generated faster but may not follow the guidelines. 69 | - `guidelines` - A list of guidelines that the report must follow. 70 | - `verbose` - If true, the application will print detailed logs to the console. 71 | 72 | #### For example: 73 | ```json 74 | { 75 | "query": "Is AI in a hype cycle?", 76 | "model": "gpt-4o", 77 | "max_sections": 3, 78 | "publish_formats": { 79 | "markdown": true, 80 | "pdf": true, 81 | "docx": true 82 | }, 83 | "follow_guidelines": true, 84 | "guidelines": [ 85 | "The report MUST fully answer the original question", 86 | "The report MUST be written in apa format", 87 | "The report MUST be written in english" 88 | ], 89 | "verbose": true 90 | } 91 | ``` 92 | 93 | ## To Deploy 94 | 95 | ```shell 96 | pip install langgraph-cli 97 | langgraph up 98 | ``` 99 | 100 | From there, see documentation [here](https://github.com/langchain-ai/langgraph-example) on how to use the streaming and async endpoints, as well as the playground. -------------------------------------------------------------------------------- /multi_agents/agent.py: -------------------------------------------------------------------------------- 1 | from agents import ChiefEditorAgent 2 | 3 | chief_editor = ChiefEditorAgent({ 4 | "query": "Is AI in a hype cycle?", 5 | "max_sections": 3, 6 | "follow_guidelines": False, 7 | "model": "gpt-4o", 8 | "guidelines": [ 9 | "The report MUST be written in APA format", 10 | "Each sub section MUST include supporting sources using hyperlinks. 
If none exist, erase the sub section or rewrite it to be a part of the previous section", 11 | "The report MUST be written in spanish" 12 | ], 13 | "verbose": False 14 | }) 15 | graph = chief_editor.init_research_team() 16 | graph = graph.compile() 17 | -------------------------------------------------------------------------------- /multi_agents/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .researcher import ResearchAgent 2 | from .writer import WriterAgent 3 | from .publisher import PublisherAgent 4 | from .reviser import ReviserAgent 5 | from .reviewer import ReviewerAgent 6 | from .editor import EditorAgent 7 | from .master import ChiefEditorAgent 8 | 9 | __all__ = [ 10 | "ChiefEditorAgent", 11 | "ResearchAgent", 12 | "WriterAgent", 13 | "EditorAgent", 14 | "PublisherAgent", 15 | "ReviserAgent", 16 | "ReviewerAgent" 17 | ] -------------------------------------------------------------------------------- /multi_agents/agents/editor.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from .utils.views import print_agent_output 3 | from .utils.llms import call_model 4 | from langgraph.graph import StateGraph, END 5 | import asyncio 6 | import json 7 | 8 | from memory.draft import DraftState 9 | from . import \ 10 | ResearchAgent, \ 11 | ReviewerAgent, \ 12 | ReviserAgent 13 | 14 | 15 | class EditorAgent: 16 | def __init__(self): 17 | pass 18 | 19 | def plan_research(self, research_state: dict): 20 | """ 21 | Curate relevant sources for a query 22 | :param summary_report: 23 | :return: 24 | :param total_sub_headers: 25 | :return: 26 | """ 27 | 28 | initial_research = research_state.get("initial_research") 29 | task = research_state.get("task") 30 | max_sections = task.get("max_sections") 31 | prompt = [{ 32 | "role": "system", 33 | "content": "You are a research director. Your goal is to oversee the research project" 34 | " from inception to completion.\n " 35 | }, { 36 | "role": "user", 37 | "content": f"Today's date is {datetime.now().strftime('%d/%m/%Y')}\n." 
38 | f"Research summary report: '{initial_research}'\n\n" 39 | f"Your task is to generate an outline of sections headers for the research project" 40 | f" based on the research summary report above.\n" 41 | f"You must generate a maximum of {max_sections} section headers.\n" 42 | f"You must focus ONLY on related research topics for subheaders and do NOT include introduction, conclusion and references.\n" 43 | f"You must return nothing but a JSON with the fields 'title' (str) and " 44 | f"'sections' (maximum {max_sections} section headers) with the following structure: " 45 | f"'{{title: string research title, date: today's date, " 46 | f"sections: ['section header 1', 'section header 2', 'section header 3' ...]}}.\n " 47 | }] 48 | 49 | print_agent_output(f"Planning an outline layout based on initial research...", agent="EDITOR") 50 | response = call_model(prompt=prompt, model=task.get("model"), response_format="json") 51 | plan = json.loads(response) 52 | 53 | return { 54 | "title": plan.get("title"), 55 | "date": plan.get("date"), 56 | "sections": plan.get("sections") 57 | } 58 | 59 | async def run_parallel_research(self, research_state: dict): 60 | research_agent = ResearchAgent() 61 | reviewer_agent = ReviewerAgent() 62 | reviser_agent = ReviserAgent() 63 | queries = research_state.get("sections") 64 | title = research_state.get("title") 65 | workflow = StateGraph(DraftState) 66 | 67 | workflow.add_node("researcher", research_agent.run_depth_research) 68 | workflow.add_node("reviewer", reviewer_agent.run) 69 | workflow.add_node("reviser", reviser_agent.run) 70 | 71 | # set up edges researcher->reviewer->reviser->reviewer... 72 | workflow.set_entry_point("researcher") 73 | workflow.add_edge('researcher', 'reviewer') 74 | workflow.add_edge('reviser', 'reviewer') 75 | workflow.add_conditional_edges('reviewer', 76 | (lambda draft: "accept" if draft['review'] is None else "revise"), 77 | {"accept": END, "revise": "reviser"}) 78 | 79 | chain = workflow.compile() 80 | 81 | # Execute the graph for each query in parallel 82 | print_agent_output(f"Running the following research tasks in parallel: {queries}...", agent="EDITOR") 83 | final_drafts = [chain.ainvoke({"task": research_state.get("task"), "topic": query, "title": title}) 84 | for query in queries] 85 | research_results = [result['draft'] for result in await asyncio.gather(*final_drafts)] 86 | 87 | return {"research_data": research_results} 88 | -------------------------------------------------------------------------------- /multi_agents/agents/master.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from langgraph.graph import StateGraph, END 4 | from .utils.views import print_agent_output 5 | from memory.research import ResearchState 6 | 7 | # Import agent classes 8 | from . 
import \ 9 | WriterAgent, \ 10 | EditorAgent, \ 11 | PublisherAgent, \ 12 | ResearchAgent 13 | 14 | 15 | class ChiefEditorAgent: 16 | def __init__(self, task: dict): 17 | self.task_id = int(time.time()) # Currently time based, but can be any unique identifier 18 | self.output_dir = f"./outputs/run_{self.task_id}_{task.get('query')[0:40]}" 19 | self.task = task 20 | os.makedirs(self.output_dir, exist_ok=True) 21 | 22 | def init_research_team(self): 23 | # Initialize agents 24 | writer_agent = WriterAgent() 25 | editor_agent = EditorAgent() 26 | research_agent = ResearchAgent() 27 | publisher_agent = PublisherAgent(self.output_dir) 28 | 29 | # Define a Langchain StateGraph with the ResearchState 30 | workflow = StateGraph(ResearchState) 31 | 32 | # Add nodes for each agent 33 | workflow.add_node("browser", research_agent.run_initial_research) 34 | workflow.add_node("planner", editor_agent.plan_research) 35 | workflow.add_node("researcher", editor_agent.run_parallel_research) 36 | workflow.add_node("writer", writer_agent.run) 37 | workflow.add_node("publisher", publisher_agent.run) 38 | 39 | workflow.add_edge('browser', 'planner') 40 | workflow.add_edge('planner', 'researcher') 41 | workflow.add_edge('researcher', 'writer') 42 | workflow.add_edge('writer', 'publisher') 43 | 44 | # set up start and end nodes 45 | workflow.set_entry_point("browser") 46 | workflow.add_edge('publisher', END) 47 | 48 | return workflow 49 | 50 | async def run_research_task(self): 51 | research_team = self.init_research_team() 52 | 53 | # compile the graph 54 | chain = research_team.compile() 55 | 56 | print_agent_output(f"Starting the research process for query '{self.task.get('query')}'...", "MASTER") 57 | result = await chain.ainvoke({"task": self.task}) 58 | 59 | return result 60 | -------------------------------------------------------------------------------- /multi_agents/agents/publisher.py: -------------------------------------------------------------------------------- 1 | from .utils.file_formats import \ 2 | write_md_to_pdf, \ 3 | write_md_to_word, \ 4 | write_text_to_md 5 | 6 | from .utils.views import print_agent_output 7 | 8 | 9 | class PublisherAgent: 10 | def __init__(self, output_dir: str): 11 | self.output_dir = output_dir 12 | 13 | async def publish_research_report(self, research_state: dict, publish_formats: dict): 14 | layout = self.generate_layout(research_state) 15 | await self.write_report_by_formats(layout, publish_formats) 16 | 17 | return layout 18 | 19 | def generate_layout(self, research_state: dict): 20 | sections = '\n\n'.join(f"{value}" 21 | for subheader in research_state.get("research_data") 22 | for key, value in subheader.items()) 23 | references = '\n'.join(f"{reference}" for reference in research_state.get("sources")) 24 | headers = research_state.get("headers") 25 | layout = f"""# {headers.get('title')} 26 | #### {headers.get("date")}: {research_state.get('date')} 27 | 28 | ## {headers.get("introduction")} 29 | {research_state.get('introduction')} 30 | 31 | ## {headers.get("table_of_contents")} 32 | {research_state.get('table_of_contents')} 33 | 34 | {sections} 35 | 36 | ## {headers.get("conclusion")} 37 | {research_state.get('conclusion')} 38 | 39 | ## {headers.get("references")} 40 | {references} 41 | """ 42 | return layout 43 | 44 | async def write_report_by_formats(self, layout:str, publish_formats: dict): 45 | if publish_formats.get("pdf"): 46 | await write_md_to_pdf(layout, self.output_dir) 47 | if publish_formats.get("docx"): 48 | await write_md_to_word(layout, 
self.output_dir) 49 | if publish_formats.get("markdown"): 50 | await write_text_to_md(layout, self.output_dir) 51 | 52 | async def run(self, research_state: dict): 53 | task = research_state.get("task") 54 | publish_formats = task.get("publish_formats") 55 | print_agent_output(output="Publishing final research report based on retrieved data...", agent="PUBLISHER") 56 | final_research_report = await self.publish_research_report(research_state, publish_formats) 57 | return {"report": final_research_report} 58 | -------------------------------------------------------------------------------- /multi_agents/agents/researcher.py: -------------------------------------------------------------------------------- 1 | from gpt_researcher import GPTResearcher 2 | from colorama import Fore, Style 3 | from .utils.views import print_agent_output 4 | 5 | 6 | class ResearchAgent: 7 | def __init__(self): 8 | pass 9 | 10 | async def research(self, query: str, research_report: str = "research_report", parent_query: str = "", verbose=True): 11 | # Initialize the researcher 12 | researcher = GPTResearcher(query=query, report_type=research_report, parent_query=parent_query, verbose=verbose) 13 | # Conduct research on the given query 14 | await researcher.conduct_research() 15 | # Write the report 16 | report = await researcher.write_report() 17 | 18 | return report 19 | 20 | async def run_subtopic_research(self, parent_query: str, subtopic: str, verbose: bool = True): 21 | try: 22 | report = await self.research(parent_query=parent_query, query=subtopic, 23 | research_report="subtopic_report", verbose=verbose) 24 | except Exception as e: 25 | print(f"{Fore.RED}Error in researching topic {subtopic}: {e}{Style.RESET_ALL}") 26 | report = None 27 | return {subtopic: report} 28 | 29 | async def run_initial_research(self, research_state: dict): 30 | task = research_state.get("task") 31 | query = task.get("query") 32 | print_agent_output(f"Running initial research on the following query: {query}", agent="RESEARCHER") 33 | return {"task": task, "initial_research": await self.research(query=query, verbose=task.get("verbose"))} 34 | 35 | async def run_depth_research(self, draft_state: dict): 36 | task = draft_state.get("task") 37 | topic = draft_state.get("topic") 38 | parent_query = task.get("query") 39 | verbose = task.get("verbose") 40 | print_agent_output(f"Running in depth research on the following report topic: {topic}", agent="RESEARCHER") 41 | research_draft = await self.run_subtopic_research(parent_query, topic, verbose) 42 | return {"draft": research_draft} 43 | -------------------------------------------------------------------------------- /multi_agents/agents/reviewer.py: -------------------------------------------------------------------------------- 1 | from .utils.views import print_agent_output 2 | from .utils.llms import call_model 3 | 4 | TEMPLATE = """You are an expert research article reviewer. \ 5 | Your goal is to review research drafts and provide feedback to the reviser only based on specific guidelines. 
\ 6 | """ 7 | 8 | class ReviewerAgent: 9 | def __init__(self): 10 | pass 11 | 12 | def review_draft(self, draft_state: dict): 13 | """ 14 | Review a draft article 15 | :param draft_state: 16 | :return: 17 | """ 18 | task = draft_state.get("task") 19 | guidelines = '- '.join(guideline for guideline in task.get("guidelines")) 20 | revision_notes = draft_state.get("revision_notes") 21 | 22 | revise_prompt = f"""The reviser has already revised the draft based on your previous review notes with the following feedback: 23 | {revision_notes}\n 24 | Please provide additional feedback ONLY if critical since the reviser has already made changes based on your previous feedback. 25 | If you think the article is sufficient or that non critical revisions are required, please aim to return None. 26 | """ 27 | 28 | review_prompt = f"""You have been tasked with reviewing the draft which was written by a non-expert based on specific guidelines. 29 | Please accept the draft if it is good enough to publish, or send it for revision, along with your notes to guide the revision. 30 | If not all of the guideline criteria are met, you should send appropriate revision notes. 31 | If the draft meets all the guidelines, please return None. 32 | {revise_prompt if revision_notes else ""} 33 | 34 | Guidelines: {guidelines}\nDraft: {draft_state.get("draft")}\n 35 | """ 36 | prompt = [{ 37 | "role": "system", 38 | "content": TEMPLATE 39 | }, { 40 | "role": "user", 41 | "content": review_prompt 42 | }] 43 | 44 | response = call_model(prompt, model=task.get("model")) 45 | 46 | if task.get("verbose"): 47 | print_agent_output(f"Review feedback is: {response}...", agent="REVIEWER") 48 | 49 | if 'None' in response: 50 | return None 51 | return response 52 | 53 | def run(self, draft_state: dict): 54 | task = draft_state.get("task") 55 | guidelines = task.get("guidelines") 56 | to_follow_guidelines = task.get("follow_guidelines") 57 | review = None 58 | if to_follow_guidelines: 59 | print_agent_output(f"Reviewing draft...", agent="REVIEWER") 60 | 61 | if task.get("verbose"): 62 | print_agent_output(f"Following guidelines {guidelines}...", agent="REVIEWER") 63 | 64 | review = self.review_draft(draft_state) 65 | else: 66 | print_agent_output(f"Ignoring guidelines...", agent="REVIEWER") 67 | return {"review": review} 68 | -------------------------------------------------------------------------------- /multi_agents/agents/reviser.py: -------------------------------------------------------------------------------- 1 | from .utils.views import print_agent_output 2 | from .utils.llms import call_model 3 | import json 4 | 5 | sample_revision_notes = """ 6 | { 7 | "draft": { 8 | draft title: The revised draft that you are submitting for review 9 | }, 10 | "revision_notes": Your message to the reviewer about the changes you made to the draft based on their feedback 11 | } 12 | """ 13 | 14 | class ReviserAgent: 15 | def __init__(self): 16 | pass 17 | 18 | def revise_draft(self, draft_state: dict): 19 | """ 20 | Review a draft article 21 | :param draft_state: 22 | :return: 23 | """ 24 | review = draft_state.get("review") 25 | task = draft_state.get("task") 26 | draft_report = draft_state.get("draft") 27 | prompt = [{ 28 | "role": "system", 29 | "content": "You are an expert writer. Your goal is to revise drafts based on reviewer notes." 
30 | }, { 31 | "role": "user", 32 | "content": f"""Draft:\n{draft_report}\nReviewer's notes:\n{review}\n\n 33 | You have been tasked by your reviewer with revising the following draft, which was written by a non-expert. 34 | If you decide to follow the reviewer's notes, please write a new draft and make sure to address all of the points they raised. 35 | Please keep all other aspects of the draft the same. 36 | You MUST return nothing but a JSON in the following format: 37 | {sample_revision_notes} 38 | """ 39 | }] 40 | 41 | response = call_model(prompt, model=task.get("model"), response_format='json') 42 | return json.loads(response) 43 | 44 | def run(self, draft_state: dict): 45 | print_agent_output(f"Rewriting draft based on feedback...", agent="REVISOR") 46 | revision = self.revise_draft(draft_state) 47 | 48 | if draft_state.get("task").get("verbose"): 49 | print_agent_output(f"Revision notes: {revision.get('revision_notes')}", agent="REVISOR") 50 | 51 | return {"draft": revision.get("draft"), 52 | "revision_notes": revision.get("revision_notes")} 53 | -------------------------------------------------------------------------------- /multi_agents/agents/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/multi_agents/agents/utils/__init__.py -------------------------------------------------------------------------------- /multi_agents/agents/utils/file_formats.py: -------------------------------------------------------------------------------- 1 | import aiofiles 2 | import urllib.parse 3 | import uuid 4 | import mistune 5 | 6 | 7 | async def write_to_file(filename: str, text: str) -> None: 8 | """Asynchronously write text to a file in UTF-8 encoding. 9 | 10 | Args: 11 | filename (str): The filename to write to. 12 | text (str): The text to write. 13 | """ 14 | # Convert text to UTF-8, replacing any problematic characters 15 | text_utf8 = text.encode('utf-8', errors='replace').decode('utf-8') 16 | 17 | async with aiofiles.open(filename, "w", encoding='utf-8') as file: 18 | await file.write(text_utf8) 19 | 20 | 21 | async def write_text_to_md(text: str, path: str) -> str: 22 | """Writes text to a Markdown file and returns the file path. 23 | 24 | Args: 25 | text (str): Text to write to the Markdown file. 26 | 27 | Returns: 28 | str: The file path of the generated Markdown file. 29 | """ 30 | task = uuid.uuid4().hex 31 | file_path = f"{path}/{task}.md" 32 | await write_to_file(file_path, text) 33 | print(f"Report written to {file_path}") 34 | return file_path 35 | 36 | 37 | async def write_md_to_pdf(text: str, path: str) -> str: 38 | """Converts Markdown text to a PDF file and returns the file path. 39 | 40 | Args: 41 | text (str): Markdown text to convert. 42 | 43 | Returns: 44 | str: The encoded file path of the generated PDF. 
45 | """ 46 | task = uuid.uuid4().hex 47 | file_path = f"{path}/{task}.pdf" 48 | 49 | try: 50 | # Moved imports to inner function to avoid known import errors with gobject-2.0 51 | from md2pdf.core import md2pdf 52 | md2pdf(file_path, 53 | md_content=text, 54 | # md_file_path=f"{file_path}.md", 55 | css_file_path="./agents/utils/pdf_styles.css", 56 | base_url=None) 57 | print(f"Report written to {file_path}") 58 | except Exception as e: 59 | print(f"Error in converting Markdown to PDF: {e}") 60 | return "" 61 | 62 | encoded_file_path = urllib.parse.quote(file_path) 63 | return encoded_file_path 64 | 65 | 66 | async def write_md_to_word(text: str, path: str) -> str: 67 | """Converts Markdown text to a DOCX file and returns the file path. 68 | 69 | Args: 70 | text (str): Markdown text to convert. 71 | 72 | Returns: 73 | str: The encoded file path of the generated DOCX. 74 | """ 75 | task = uuid.uuid4().hex 76 | file_path = f"{path}/{task}.docx" 77 | 78 | try: 79 | from htmldocx import HtmlToDocx 80 | from docx import Document 81 | # Convert report markdown to HTML 82 | html = mistune.html(text) 83 | # Create a document object 84 | doc = Document() 85 | # Convert the html generated from the report to document format 86 | HtmlToDocx().add_html_to_document(html, doc) 87 | 88 | # Saving the docx document to file_path 89 | doc.save(file_path) 90 | 91 | print(f"Report written to {file_path}") 92 | 93 | encoded_file_path = urllib.parse.quote(f"{file_path}.docx") 94 | return encoded_file_path 95 | 96 | except Exception as e: 97 | print(f"Error in converting Markdown to DOCX: {e}") 98 | return "" 99 | -------------------------------------------------------------------------------- /multi_agents/agents/utils/llms.py: -------------------------------------------------------------------------------- 1 | from langchain.adapters.openai import convert_openai_messages 2 | from langchain_openai import ChatOpenAI 3 | 4 | 5 | def call_model(prompt: list, model: str, max_retries: int = 2, response_format: str = None) -> str: 6 | 7 | optional_params = {} 8 | if response_format == 'json': 9 | optional_params = { 10 | "response_format": {"type": "json_object"} 11 | } 12 | 13 | lc_messages = convert_openai_messages(prompt) 14 | response = ChatOpenAI(model=model, max_retries=max_retries, model_kwargs=optional_params).invoke(lc_messages).content 15 | return response -------------------------------------------------------------------------------- /multi_agents/agents/utils/pdf_styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: 'Libre Baskerville', serif; 3 | font-size: 12pt; /* standard size for academic papers */ 4 | line-height: 1.6; /* for readability */ 5 | color: #333; /* softer on the eyes than black */ 6 | background-color: #fff; /* white background */ 7 | margin: 0; 8 | padding: 0; 9 | } 10 | 11 | h1, h2, h3, h4, h5, h6 { 12 | font-family: 'Libre Baskerville', serif; 13 | color: #000; /* darker than the body text */ 14 | margin-top: 1em; /* space above headers */ 15 | } 16 | 17 | h1 { 18 | font-size: 2em; /* make h1 twice the size of the body text */ 19 | } 20 | 21 | h2 { 22 | font-size: 1.5em; 23 | } 24 | 25 | /* Add some space between paragraphs */ 26 | p { 27 | margin-bottom: 1em; 28 | } 29 | 30 | /* Style for blockquotes, often used in academic papers */ 31 | blockquote { 32 | font-style: italic; 33 | margin: 1em 0; 34 | padding: 1em; 35 | background-color: #f9f9f9; /* a light grey background */ 36 | } 37 | 38 | /* You might want to style 
tables, figures, etc. too */ 39 | table { 40 | border-collapse: collapse; 41 | width: 100%; 42 | } 43 | 44 | table, th, td { 45 | border: 1px solid #ddd; 46 | text-align: left; 47 | padding: 8px; 48 | } 49 | 50 | th { 51 | background-color: #f2f2f2; 52 | color: black; 53 | } -------------------------------------------------------------------------------- /multi_agents/agents/utils/views.py: -------------------------------------------------------------------------------- 1 | from colorama import Fore, Style 2 | from enum import Enum 3 | 4 | 5 | class AgentColor(Enum): 6 | RESEARCHER = Fore.LIGHTBLUE_EX 7 | EDITOR = Fore.YELLOW 8 | WRITER = Fore.LIGHTGREEN_EX 9 | PUBLISHER = Fore.MAGENTA 10 | REVIEWER = Fore.CYAN 11 | REVISOR = Fore.LIGHTWHITE_EX 12 | MASTER = Fore.LIGHTYELLOW_EX 13 | 14 | 15 | def print_agent_output(output:str, agent: str="RESEARCHER"): 16 | print(f"{AgentColor[agent].value}{agent}: {output}{Style.RESET_ALL}") -------------------------------------------------------------------------------- /multi_agents/agents/writer.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json5 as json 3 | from .utils.views import print_agent_output 4 | from .utils.llms import call_model 5 | 6 | sample_json = """ 7 | { 8 | "table_of_contents": A table of contents in markdown syntax (using '-') based on the research headers and subheaders, 9 | "introduction": An indepth introduction to the topic in markdown syntax and hyperlink references to relevant sources, 10 | "conclusion": A conclusion to the entire research based on all research data in markdown syntax and hyperlink references to relevant sources, 11 | "sources": A list with strings of all used source links in the entire research data in markdown syntax and apa citation format. For example: ['- Title, year, Author [source url](source)', ...] 12 | } 13 | """ 14 | 15 | 16 | class WriterAgent: 17 | def __init__(self): 18 | pass 19 | 20 | def get_headers(self, research_state: dict): 21 | return { 22 | "title": research_state.get("title"), 23 | "date": "Date", 24 | "introduction": "Introduction", 25 | "table_of_contents": "Table of Contents", 26 | "conclusion": "Conclusion", 27 | "references": "References" 28 | } 29 | 30 | def write_sections(self, research_state: dict): 31 | query = research_state.get("title") 32 | data = research_state.get("research_data") 33 | task = research_state.get("task") 34 | follow_guidelines = task.get("follow_guidelines") 35 | guidelines = task.get("guidelines") 36 | 37 | prompt = [{ 38 | "role": "system", 39 | "content": "You are a research writer. Your sole purpose is to write a well-written " 40 | "research reports about a " 41 | "topic based on research findings and information.\n " 42 | }, { 43 | "role": "user", 44 | "content": f"Today's date is {datetime.now().strftime('%d/%m/%Y')}\n." 45 | f"Query or Topic: {query}\n" 46 | f"Research data: {str(data)}\n" 47 | f"Your task is to write an in depth, well written and detailed " 48 | f"introduction and conclusion to the research report based on the provided research data. " 49 | f"Do not include headers in the results.\n" 50 | f"You MUST include any relevant sources to the introduction and conclusion as markdown hyperlinks -" 51 | f"For example: 'This is a sample text. 
([url website](url))'\n\n" 52 | f"{f'You must follow the guidelines provided: {guidelines}' if follow_guidelines else ''}\n" 53 | f"You MUST return nothing but a JSON in the following format (without json markdown):\n" 54 | f"{sample_json}\n\n" 55 | 56 | }] 57 | 58 | response = call_model(prompt, task.get("model"), max_retries=2, response_format='json') 59 | return json.loads(response) 60 | 61 | def revise_headers(self, task: dict, headers: dict): 62 | prompt = [{ 63 | "role": "system", 64 | "content": """You are a research writer. 65 | Your sole purpose is to revise the headers data based on the given guidelines.""" 66 | }, { 67 | "role": "user", 68 | "content": f"""Your task is to revise the given headers JSON based on the guidelines given. 69 | You are to follow the guidelines but the values should be in simple strings, ignoring all markdown syntax. 70 | You must return nothing but a JSON in the same format as given in headers data. 71 | Guidelines: {task.get("guidelines")}\n 72 | Headers Data: {headers}\n 73 | """ 74 | 75 | }] 76 | 77 | response = call_model(prompt, task.get("model"), response_format='json') 78 | return {"headers": json.loads(response)} 79 | 80 | def run(self, research_state: dict): 81 | print_agent_output(f"Writing final research report based on research data...", agent="WRITER") 82 | research_layout_content = self.write_sections(research_state) 83 | 84 | if research_state.get("task").get("verbose"): 85 | print_agent_output(research_layout_content, agent="WRITER") 86 | 87 | headers = self.get_headers(research_state) 88 | if research_state.get("task").get("follow_guidelines"): 89 | print_agent_output("Rewriting layout based on guidelines...", agent="WRITER") 90 | headers = self.revise_headers(task=research_state.get("task"), headers=headers).get("headers") 91 | 92 | return {**research_layout_content, "headers": headers} 93 | -------------------------------------------------------------------------------- /multi_agents/langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "python_version": "3.11", 3 | "dependencies": [ 4 | "." 5 | ], 6 | "graphs": { 7 | "agent": "./agent.py:graph" 8 | }, 9 | "env": ".env" 10 | } -------------------------------------------------------------------------------- /multi_agents/main.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | from agents import ChiefEditorAgent 3 | import asyncio 4 | import json 5 | import os 6 | 7 | # Run with LangSmith if API key is set 8 | if os.environ.get("LANGCHAIN_API_KEY"): 9 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 10 | load_dotenv() 11 | 12 | 13 | def open_task(): 14 | with open('task.json', 'r') as f: 15 | task = json.load(f) 16 | 17 | if not task: 18 | raise Exception("No task provided. 
Please include a task.json file in the root directory.") 19 | 20 | return task 21 | 22 | 23 | async def main(): 24 | task = open_task() 25 | 26 | chief_editor = ChiefEditorAgent(task) 27 | research_report = await chief_editor.run_research_task() 28 | 29 | return research_report 30 | 31 | if __name__ == "__main__": 32 | asyncio.run(main()) 33 | -------------------------------------------------------------------------------- /multi_agents/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/multi_agents/memory/__init__.py -------------------------------------------------------------------------------- /multi_agents/memory/draft.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, List, Annotated 2 | import operator 3 | 4 | 5 | class DraftState(TypedDict): 6 | task: dict 7 | topic: str 8 | draft: dict 9 | review: str 10 | revision_notes: str -------------------------------------------------------------------------------- /multi_agents/memory/research.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, List, Annotated 2 | import operator 3 | 4 | 5 | class ResearchState(TypedDict): 6 | task: dict 7 | initial_research: str 8 | sections: List[str] 9 | research_data: List[dict] 10 | # Report layout 11 | title: str 12 | headers: dict 13 | date: str 14 | table_of_contents: str 15 | introduction: str 16 | conclusion: str 17 | sources: List[str] 18 | report: str 19 | 20 | 21 | -------------------------------------------------------------------------------- /multi_agents/requirements.txt: -------------------------------------------------------------------------------- 1 | langgraph 2 | gpt_researcher 3 | langchain-community 4 | python-dotenv 5 | weasyprint 6 | json5 7 | -------------------------------------------------------------------------------- /multi_agents/task.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": "Is AI in a hype cycle?", 3 | "max_sections": 3, 4 | "publish_formats": { 5 | "markdown": true, 6 | "pdf": true, 7 | "docx": true 8 | }, 9 | "follow_guidelines": false, 10 | "model": "gpt-4o", 11 | "guidelines": [ 12 | "The report MUST be written in APA format", 13 | "Each sub section MUST include supporting sources using hyperlinks. If none exist, erase the sub section or rewrite it to be a part of the previous section", 14 | "The report MUST be written in spanish" 15 | ], 16 | "verbose": true 17 | } -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "gpt-researcher" 3 | version = "0.0.5" 4 | description = "GPT Researcher is an autonomous agent designed for comprehensive online research on a variety of tasks." 
5 | authors = ["Tavily "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.10,<3.12" 11 | beautifulsoup4 = ">=4.12.2" 12 | colorama = ">=0.4.6" 13 | duckduckgo_search = ">=4.1.1" 14 | md2pdf = ">=1.0.1" 15 | openai = ">=1.3.3" 16 | playwright = ">=1.39.0" 17 | python-dotenv = ">=1.0.0" 18 | pyyaml = ">=6.0.1" 19 | uvicorn = ">=0.24.0.post1" 20 | pydantic = ">=2.5.1" 21 | fastapi = ">=0.104.1" 22 | python-multipart = ">=0.0.6" 23 | markdown = ">=3.5.1" 24 | langchain = ">=0.0.350" 25 | langgraph = ">=0.0.29" 26 | tavily-python = ">=0.2.8" 27 | permchain = ">=0.0.6" 28 | arxiv = ">=2.0.0" 29 | PyMuPDF = ">=1.23.6" 30 | requests = ">=2.31.0" 31 | jinja2 = ">=3.1.2" 32 | aiofiles = ">=23.2.1" 33 | newspaper3k = ">=0.2.8" 34 | langchain_community = ">=0.0.28" 35 | SQLAlchemy = ">=2.0.28" 36 | mistune = "^3.0.2" 37 | htmldocx = "^0.0.6" 38 | python-docx = "^1.1.0" 39 | langchain-openai = "^0.1.1" 40 | langchain-google-genai = "^0.0.11" 41 | lxml = { version = ">=4.9.2", extras = ["html_clean"] } 42 | unstructured = "^0.13.0" 43 | 44 | [build-system] 45 | requires = ["poetry-core"] 46 | build-backend = "poetry.core.masonry.api" 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # dependencies 2 | beautifulsoup4 3 | colorama 4 | duckduckgo_search 5 | yahoo-search-py 6 | md2pdf 7 | playwright 8 | openai 9 | python-dotenv 10 | pyyaml 11 | uvicorn 12 | pydantic 13 | fastapi 14 | python-multipart 15 | markdown 16 | langchain 17 | langchain-openai 18 | langchain-google-genai 19 | langchain_community 20 | tavily-python 21 | arxiv 22 | PyMuPDF 23 | requests 24 | jinja2 25 | aiofiles 26 | newspaper3k 27 | SQLAlchemy 28 | mistune 29 | python-docx 30 | htmldocx 31 | lxml[html_clean] 32 | websockets 33 | unstructured 34 | pytest 35 | pytest-asyncio -------------------------------------------------------------------------------- /scraping/js/overlay.js: -------------------------------------------------------------------------------- 1 | const overlay = document.createElement('div'); 2 | Object.assign(overlay.style, { 3 | position: 'fixed', 4 | zIndex: 999999, 5 | top: 0, 6 | left: 0, 7 | width: '100%', 8 | height: '100%', 9 | background: 'rgba(0, 0, 0, 0.7)', 10 | color: '#fff', 11 | fontSize: '24px', 12 | fontWeight: 'bold', 13 | display: 'flex', 14 | justifyContent: 'center', 15 | alignItems: 'center', 16 | }); 17 | const textContent = document.createElement('div'); 18 | Object.assign(textContent.style, { 19 | textAlign: 'center', 20 | }); 21 | textContent.textContent = 'Tavily AI: Analyzing Page'; 22 | overlay.appendChild(textContent); 23 | document.body.append(overlay); 24 | document.body.style.overflow = 'hidden'; 25 | let dotCount = 0; 26 | setInterval(() => { 27 | textContent.textContent = 'Tavily AI: Analyzing Page' + '.'.repeat(dotCount); 28 | dotCount = (dotCount + 1) % 4; 29 | }, 1000); 30 | -------------------------------------------------------------------------------- /scraping/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwchase17/gpt-researcher/df4350939dbe42a059d7cd00aec0ae61388c9e35/scraping/processing/__init__.py -------------------------------------------------------------------------------- /scraping/processing/html.py: -------------------------------------------------------------------------------- 1 | """HTML 
processing functions""" 2 | from __future__ import annotations 3 | 4 | from bs4 import BeautifulSoup 5 | from requests.compat import urljoin 6 | 7 | 8 | def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> list[tuple[str, str]]: 9 | """Extract hyperlinks from a BeautifulSoup object 10 | 11 | Args: 12 | soup (BeautifulSoup): The BeautifulSoup object 13 | base_url (str): The base URL 14 | 15 | Returns: 16 | List[Tuple[str, str]]: The extracted hyperlinks 17 | """ 18 | return [ 19 | (link.text, urljoin(base_url, link["href"])) 20 | for link in soup.find_all("a", href=True) 21 | ] 22 | 23 | 24 | def format_hyperlinks(hyperlinks: list[tuple[str, str]]) -> list[str]: 25 | """Format hyperlinks to be displayed to the user 26 | 27 | Args: 28 | hyperlinks (List[Tuple[str, str]]): The hyperlinks to format 29 | 30 | Returns: 31 | List[str]: The formatted hyperlinks 32 | """ 33 | return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks] 34 | -------------------------------------------------------------------------------- /scraping/scrape_skills.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import PyMuPDFLoader 2 | from langchain.retrievers import ArxivRetriever 3 | 4 | 5 | def scrape_pdf_with_pymupdf(url) -> str: 6 | """Scrape a pdf with pymupdf 7 | 8 | Args: 9 | url (str): The url of the pdf to scrape 10 | 11 | Returns: 12 | str: The text scraped from the pdf 13 | """ 14 | loader = PyMuPDFLoader(url) 15 | doc = loader.load() 16 | return str(doc) 17 | 18 | 19 | def scrape_pdf_with_arxiv(query) -> str: 20 | """Scrape a pdf with arxiv 21 | default document length of 70000 about ~15 pages or None for no limit 22 | 23 | Args: 24 | query (str): The query to search for 25 | 26 | Returns: 27 | str: The text scraped from the pdf 28 | """ 29 | retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) 30 | docs = retriever.get_relevant_documents(query=query) 31 | return docs[0].page_content -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open(r"README.md", "r", encoding="utf-8") as f: 4 | long_description = f.read() 5 | 6 | with open("requirements.txt", "r") as f: 7 | reqs = [line.strip() for line in f if ('selenium' not in line and 'webdriver' not in line)] 8 | 9 | setup( 10 | name="gpt-researcher", 11 | version="0.5.3", 12 | description="GPT Researcher is an autonomous agent designed for comprehensive online research on a variety of tasks.", 13 | package_dir={'gpt_researcher': 'gpt_researcher'}, 14 | packages=find_packages(), 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/assafelovic/gpt-researcher", 18 | author="Assaf Elovic", 19 | author_email="assaf.elovic@gmail.com", 20 | license="MIT", 21 | classifiers=[ 22 | "License :: OSI Approved :: MIT License", 23 | "Intended Audience :: Developers", 24 | "Intended Audience :: Education", 25 | "Intended Audience :: Science/Research", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 29 | ], 30 | install_requires=reqs, 31 | 32 | 33 | ) -------------------------------------------------------------------------------- /tests/all-6-report-types.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import pytest 4 | from gpt_researcher import GPTResearcher 5 | 6 | # Define the report types to test 7 | report_types = [ 8 | "research_report", 9 | "custom_report", 10 | "subtopic_report", 11 | "summary_report", 12 | "detailed_report", 13 | "quick_report" 14 | ] 15 | 16 | # Define a common query and sources for testing 17 | query = "What are the latest advancements in AI?" 18 | sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] 19 | 20 | # Define the output directory 21 | output_dir = "./outputs" 22 | 23 | @pytest.mark.asyncio 24 | @pytest.mark.parametrize("report_type", report_types) 25 | async def test_gpt_researcher(report_type): 26 | # Ensure the output directory exists 27 | if not os.path.exists(output_dir): 28 | os.makedirs(output_dir) 29 | 30 | # Create an instance of GPTResearcher 31 | researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) 32 | 33 | # Conduct research and write the report 34 | await researcher.conduct_research() 35 | report = await researcher.write_report() 36 | 37 | # Define the expected output filenames 38 | pdf_filename = os.path.join(output_dir, f"{report_type}.pdf") 39 | docx_filename = os.path.join(output_dir, f"{report_type}.docx") 40 | 41 | # Check if the PDF and DOCX files are created 42 | # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}" 43 | # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}" 44 | 45 | # Clean up the generated files (optional) 46 | # os.remove(pdf_filename) 47 | # os.remove(docx_filename) 48 | 49 | if __name__ == "__main__": 50 | pytest.main() -------------------------------------------------------------------------------- /tests/documents-report-source.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import pytest 4 | from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct 5 | from dotenv import load_dotenv 6 | load_dotenv() 7 | 8 | # Define the report types to test 9 | report_types = [ 10 | "research_report", 11 | "custom_report", 12 | "subtopic_report", 13 | "summary_report", 14 | "detailed_report", 15 | "quick_report" 16 | ] 17 | 18 | # Define a common query and sources for testing 19 | query = "What can you tell me about myself based on my documents?" 
20 | 21 | # Define the output directory 22 | output_dir = "./outputs" 23 | 24 | @pytest.mark.asyncio 25 | @pytest.mark.parametrize("report_type", report_types) 26 | async def test_gpt_researcher(report_type): 27 | # Ensure the output directory exists 28 | if not os.path.exists(output_dir): 29 | os.makedirs(output_dir) 30 | 31 | # Create an instance of GPTResearcher with report_source set to "documents" 32 | researcher = GPTResearcher(query=query, report_type=report_type, report_source="documents") 33 | 34 | # Conduct research and write the report 35 | await researcher.conduct_research() 36 | report = await researcher.write_report() 37 | 38 | # Define the expected output filenames 39 | pdf_filename = os.path.join(output_dir, f"{report_type}.pdf") 40 | docx_filename = os.path.join(output_dir, f"{report_type}.docx") 41 | 42 | # Check if the PDF and DOCX files are created 43 | # assert os.path.exists(pdf_filename), f"PDF file not found for report type: {report_type}" 44 | # assert os.path.exists(docx_filename), f"DOCX file not found for report type: {report_type}" 45 | 46 | # Clean up the generated files (optional) 47 | # os.remove(pdf_filename) 48 | # os.remove(docx_filename) 49 | 50 | if __name__ == "__main__": 51 | pytest.main() --------------------------------------------------------------------------------