├── .env.example ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── actions │ ├── login-docker │ │ └── action.yml │ ├── setup-docker │ │ └── action.yml │ ├── setup-postgres-ext │ │ └── action.yml │ ├── setup-python-full │ │ └── action.yml │ ├── setup-python-light │ │ └── action.yml │ ├── start-r2r-full │ │ └── action.yml │ └── start-r2r-light │ │ └── action.yml └── workflows │ ├── build-cluster-service-docker.yml │ ├── build-r2r-docker.yml │ ├── build-unst-service-docker.yml │ ├── publish-to-npm.yml │ ├── publish-to-pypi.yml │ ├── quality.yml │ ├── r2r-full-py-integration-tests.yml │ ├── r2r-js-sdk-ci.yml │ ├── r2r-js-sdk-integration-tests.yml │ └── r2r-light-py-integration-tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.md ├── MANIFEST.md ├── README.md ├── SECURITY.md ├── docker ├── compose.full.swarm.yaml ├── compose.full.yaml ├── compose.yaml ├── env │ ├── hatchet.env │ ├── minio.env │ ├── postgres.env │ ├── r2r-dashboard.env │ ├── r2r-full.env │ └── r2r.env ├── fluent-bit │ ├── fluent-bit.conf │ └── parsers.conf ├── scripts │ ├── create-hatchet-db.sh │ ├── setup-token.sh │ └── start-r2r.sh ├── user_configs │ └── README.md └── user_tools │ ├── README.md │ └── user_requirements.txt ├── js ├── README.md └── sdk │ ├── .prettierignore │ ├── README.md │ ├── __tests__ │ ├── ChunksIntegrationSuperUser.test.ts │ ├── CollectionsIntegrationSuperUser.test.ts │ ├── ConversationsIntegrationSuperUser.test.ts │ ├── ConversationsIntegrationUser.test.ts │ ├── DocumentsAndCollectionsIntegrationUser.test.ts │ ├── DocumentsIntegrationSuperUser.test.ts │ ├── GraphsIntegrationSuperUser.test.ts │ ├── PromptsIntegrationSuperUser.test.ts │ ├── RetrievalIntegrationSuperUser.test.ts │ ├── SystemIntegrationSuperUser.test.ts │ ├── SystemIntegrationUser.test.ts │ ├── UsersIntegrationSuperUser.test.ts │ └── util │ │ └── typeTransformer.test.ts │ ├── examples │ ├── data │ 
│ ├── folder │ │ │ ├── karamozov.txt │ │ │ └── myshkin.txt │ │ ├── invalid.json │ │ ├── marmeladov.txt │ │ ├── raskolnikov.txt │ │ ├── raskolnikov_2.txt │ │ ├── sonia.txt │ │ └── zametov.txt │ └── hello_r2r.js │ ├── jest.config.js │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── src │ ├── baseClient.ts │ ├── index.ts │ ├── r2rClient.ts │ ├── types.ts │ ├── utils │ │ ├── index.ts │ │ ├── typeTransformer.ts │ │ └── utils.ts │ └── v3 │ │ └── clients │ │ ├── chunks.ts │ │ ├── collections.ts │ │ ├── conversations.ts │ │ ├── documents.ts │ │ ├── graphs.ts │ │ ├── indices.ts │ │ ├── prompts.ts │ │ ├── retrieval.ts │ │ ├── system.ts │ │ └── users.ts │ └── tsconfig.json ├── llms.txt ├── py ├── .dockerignore ├── Dockerfile ├── README.md ├── all_possible_config.toml ├── core │ ├── __init__.py │ ├── agent │ │ ├── __init__.py │ │ ├── base.py │ │ ├── rag.py │ │ └── research.py │ ├── base │ │ ├── __init__.py │ │ ├── abstractions │ │ │ └── __init__.py │ │ ├── agent │ │ │ ├── __init__.py │ │ │ ├── agent.py │ │ │ └── tools │ │ │ │ ├── built_in │ │ │ │ ├── get_file_content.py │ │ │ │ ├── search_file_descriptions.py │ │ │ │ ├── search_file_knowledge.py │ │ │ │ ├── tavily_extract.py │ │ │ │ ├── tavily_search.py │ │ │ │ ├── web_scrape.py │ │ │ │ └── web_search.py │ │ │ │ └── registry.py │ │ ├── api │ │ │ └── models │ │ │ │ └── __init__.py │ │ ├── parsers │ │ │ ├── __init__.py │ │ │ └── base_parser.py │ │ ├── providers │ │ │ ├── __init__.py │ │ │ ├── auth.py │ │ │ ├── base.py │ │ │ ├── crypto.py │ │ │ ├── database.py │ │ │ ├── email.py │ │ │ ├── embedding.py │ │ │ ├── file.py │ │ │ ├── ingestion.py │ │ │ ├── llm.py │ │ │ ├── ocr.py │ │ │ ├── orchestration.py │ │ │ └── scheduler.py │ │ └── utils │ │ │ └── __init__.py │ ├── configs │ │ ├── full.toml │ │ ├── full_azure.toml │ │ ├── full_lm_studio.toml │ │ ├── full_ollama.toml │ │ ├── gemini.toml │ │ ├── lm_studio.toml │ │ ├── ollama.toml │ │ ├── r2r_azure.toml │ │ ├── r2r_azure_with_test_limits.toml │ │ ├── 
r2r_with_auth.toml │ │ └── tavily.toml │ ├── examples │ │ ├── __init__.py │ │ ├── data │ │ │ ├── DeepSeek_R1.pdf │ │ │ ├── aristotle.txt │ │ │ ├── aristotle_v2.txt │ │ │ ├── aristotle_v3.txt │ │ │ ├── got.txt │ │ │ ├── graphrag.pdf │ │ │ ├── lyft_2021.pdf │ │ │ ├── pg_essay_1.html │ │ │ ├── pg_essay_2.html │ │ │ ├── pg_essay_3.html │ │ │ ├── pg_essay_4.html │ │ │ ├── pg_essay_5.html │ │ │ ├── sample.mp3 │ │ │ ├── sample2.mp3 │ │ │ ├── screen_shot.png │ │ │ ├── test.txt │ │ │ ├── uber_2021.pdf │ │ │ └── yc_companies.txt │ │ ├── hello_r2r.ipynb │ │ ├── hello_r2r.py │ │ └── supported_file_types │ │ │ ├── bmp.bmp │ │ │ ├── css.css │ │ │ ├── csv.csv │ │ │ ├── doc.doc │ │ │ ├── docx.docx │ │ │ ├── eml.eml │ │ │ ├── epub.epub │ │ │ ├── heic.heic │ │ │ ├── html.html │ │ │ ├── jpeg.jpeg │ │ │ ├── jpg.jpg │ │ │ ├── js.js │ │ │ ├── json.json │ │ │ ├── md.md │ │ │ ├── msg.msg │ │ │ ├── odt.odt │ │ │ ├── org.org │ │ │ ├── p7s.p7s │ │ │ ├── pdf.pdf │ │ │ ├── png.png │ │ │ ├── ppt.ppt │ │ │ ├── pptx.pptx │ │ │ ├── py.py │ │ │ ├── rst.rst │ │ │ ├── rtf.rtf │ │ │ ├── tiff.tiff │ │ │ ├── ts.ts │ │ │ ├── tsv.tsv │ │ │ ├── txt.txt │ │ │ ├── xls.xls │ │ │ └── xlsx.xlsx │ ├── main │ │ ├── __init__.py │ │ ├── abstractions.py │ │ ├── api │ │ │ └── v3 │ │ │ │ ├── base_router.py │ │ │ │ ├── chunks_router.py │ │ │ │ ├── collections_router.py │ │ │ │ ├── conversations_router.py │ │ │ │ ├── documents_router.py │ │ │ │ ├── graph_router.py │ │ │ │ ├── indices_router.py │ │ │ │ ├── prompts_router.py │ │ │ │ ├── retrieval_router.py │ │ │ │ ├── system_router.py │ │ │ │ └── users_router.py │ │ ├── app.py │ │ ├── app_entry.py │ │ ├── assembly │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── factory.py │ │ │ └── utils.py │ │ ├── config.py │ │ ├── middleware │ │ │ ├── __init__.py │ │ │ └── project_schema.py │ │ ├── orchestration │ │ │ ├── __init__.py │ │ │ ├── hatchet │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_workflow.py │ │ │ │ └── ingestion_workflow.py │ │ │ └── simple │ │ │ │ ├── __init__.py │ │ 
│ │ ├── graph_workflow.py │ │ │ │ └── ingestion_workflow.py │ │ └── services │ │ │ ├── __init__.py │ │ │ ├── auth_service.py │ │ │ ├── base.py │ │ │ ├── graph_service.py │ │ │ ├── ingestion_service.py │ │ │ ├── maintenance_service.py │ │ │ ├── management_service.py │ │ │ └── retrieval_service.py │ ├── parsers │ │ ├── __init__.py │ │ ├── media │ │ │ ├── __init__.py │ │ │ ├── audio_parser.py │ │ │ ├── bmp_parser.py │ │ │ ├── doc_parser.py │ │ │ ├── docx_parser.py │ │ │ ├── img_parser.py │ │ │ ├── odt_parser.py │ │ │ ├── pdf_parser.py │ │ │ ├── ppt_parser.py │ │ │ ├── pptx_parser.py │ │ │ └── rtf_parser.py │ │ ├── structured │ │ │ ├── __init__.py │ │ │ ├── csv_parser.py │ │ │ ├── eml_parser.py │ │ │ ├── epub_parser.py │ │ │ ├── json_parser.py │ │ │ ├── msg_parser.py │ │ │ ├── org_parser.py │ │ │ ├── p7s_parser.py │ │ │ ├── rst_parser.py │ │ │ ├── tsv_parser.py │ │ │ ├── xls_parser.py │ │ │ └── xlsx_parser.py │ │ └── text │ │ │ ├── __init__.py │ │ │ ├── css_parser.py │ │ │ ├── html_parser.py │ │ │ ├── js_parser.py │ │ │ ├── md_parser.py │ │ │ ├── python_parser.py │ │ │ ├── text_parser.py │ │ │ └── ts_parser.py │ ├── providers │ │ ├── __init__.py │ │ ├── auth │ │ │ ├── __init__.py │ │ │ ├── clerk.py │ │ │ ├── jwt.py │ │ │ ├── r2r_auth.py │ │ │ └── supabase.py │ │ ├── crypto │ │ │ ├── __init__.py │ │ │ ├── bcrypt.py │ │ │ └── nacl.py │ │ ├── database │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── chunks.py │ │ │ ├── collections.py │ │ │ ├── conversations.py │ │ │ ├── documents.py │ │ │ ├── filters.py │ │ │ ├── graphs.py │ │ │ ├── limits.py │ │ │ ├── maintenance.py │ │ │ ├── postgres.py │ │ │ ├── prompts │ │ │ │ ├── __init__.py │ │ │ │ ├── chunk_enrichment.yaml │ │ │ │ ├── collection_summary.yaml │ │ │ │ ├── dynamic_rag_agent.yaml │ │ │ │ ├── dynamic_rag_agent_xml_tooling.yaml │ │ │ │ ├── graph_communities.yaml │ │ │ │ ├── graph_entity_description.yaml │ │ │ │ ├── graph_extraction.yaml │ │ │ │ ├── hyde.yaml │ │ │ │ ├── rag.yaml │ │ │ │ ├── rag_fusion.yaml │ │ │ │ ├── 
static_rag_agent.yaml │ │ │ │ ├── static_research_agent.yaml │ │ │ │ ├── summary.yaml │ │ │ │ ├── system.yaml │ │ │ │ ├── vision_img.yaml │ │ │ │ └── vision_pdf.yaml │ │ │ ├── prompts_handler.py │ │ │ ├── tokens.py │ │ │ ├── users.py │ │ │ └── utils.py │ │ ├── email │ │ │ ├── __init__.py │ │ │ ├── console_mock.py │ │ │ ├── mailersend.py │ │ │ ├── sendgrid.py │ │ │ └── smtp.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ ├── litellm.py │ │ │ ├── ollama.py │ │ │ ├── openai.py │ │ │ └── utils.py │ │ ├── file │ │ │ ├── __init__.py │ │ │ ├── postgres.py │ │ │ └── s3.py │ │ ├── ingestion │ │ │ ├── __init__.py │ │ │ ├── r2r │ │ │ │ └── base.py │ │ │ └── unstructured │ │ │ │ └── base.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── anthropic.py │ │ │ ├── azure_foundry.py │ │ │ ├── litellm.py │ │ │ ├── openai.py │ │ │ ├── r2r_llm.py │ │ │ └── utils.py │ │ ├── ocr │ │ │ ├── __init__.py │ │ │ └── mistral.py │ │ ├── orchestration │ │ │ ├── __init__.py │ │ │ ├── hatchet.py │ │ │ └── simple.py │ │ └── scheduler │ │ │ ├── __init__.py │ │ │ └── apscheduler.py │ └── utils │ │ ├── __init__.py │ │ ├── context.py │ │ ├── logging_config.py │ │ ├── sentry.py │ │ └── serper.py ├── migrations │ ├── README │ ├── alembic.ini │ ├── env.py │ ├── script.py.mako │ └── versions │ │ ├── 2fac23e4d91b_migrate_to_document_search.py │ │ ├── 3efc7b3b1b3d_add_total_tokens_count.py │ │ ├── 7eb70560f406_add_limits_overrides_to_users.py │ │ ├── 8077140e1e99_v3_api_database_revision.py │ │ ├── c45a9cf6a8a4_add_user_and_document_count_to_.py │ │ └── d342e632358a_migrate_to_asyncpg.py ├── pyproject.toml ├── r2r │ ├── __init__.py │ ├── mcp.py │ ├── r2r.toml │ └── serve.py ├── sdk │ ├── README.md │ ├── __init__.py │ ├── asnyc_methods │ │ ├── __init__.py │ │ ├── chunks.py │ │ ├── collections.py │ │ ├── conversations.py │ │ ├── documents.py │ │ ├── graphs.py │ │ ├── indices.py │ │ ├── prompts.py │ │ ├── retrieval.py │ │ ├── system.py │ │ └── users.py │ ├── async_client.py │ ├── base │ │ ├── __init_.py │ │ └── 
base_client.py │ ├── models.py │ ├── sync_client.py │ └── sync_methods │ │ ├── __init__.py │ │ ├── chunks.py │ │ ├── collections.py │ │ ├── conversations.py │ │ ├── documents.py │ │ ├── graphs.py │ │ ├── indices.py │ │ ├── prompts.py │ │ ├── retrieval.py │ │ ├── system.py │ │ └── users.py ├── shared │ ├── __init__.py │ ├── abstractions │ │ ├── __init__.py │ │ ├── base.py │ │ ├── document.py │ │ ├── exception.py │ │ ├── graph.py │ │ ├── llm.py │ │ ├── prompt.py │ │ ├── search.py │ │ ├── tool.py │ │ ├── user.py │ │ └── vector.py │ ├── api │ │ └── models │ │ │ ├── __init__.py │ │ │ ├── auth │ │ │ ├── __init__.py │ │ │ └── responses.py │ │ │ ├── base.py │ │ │ ├── graph │ │ │ ├── __init__.py │ │ │ └── responses.py │ │ │ ├── ingestion │ │ │ ├── __init__.py │ │ │ └── responses.py │ │ │ ├── management │ │ │ ├── __init__.py │ │ │ └── responses.py │ │ │ └── retrieval │ │ │ ├── __init__.py │ │ │ └── responses.py │ └── utils │ │ ├── __init__.py │ │ ├── base_utils.py │ │ └── splitter │ │ ├── __init__.py │ │ └── text.py ├── tests │ ├── integration │ │ ├── conftest.py │ │ ├── test_agent.py │ │ ├── test_base.py │ │ ├── test_chunks.py │ │ ├── test_collections.py │ │ ├── test_collections_users_interaction.py │ │ ├── test_conversations.py │ │ ├── test_documents.py │ │ ├── test_filters.py │ │ ├── test_graphs.py │ │ ├── test_indices.py │ │ ├── test_ingestion.py │ │ ├── test_retrieval.py │ │ ├── test_retrieval_advanced.py │ │ ├── test_system.py │ │ └── test_users.py │ ├── scaling │ │ ├── __init__.py │ │ └── loadTester.py │ └── unit │ │ ├── agent │ │ ├── test_agent.py │ │ ├── test_agent_citations.py │ │ ├── test_agent_citations_old.py │ │ ├── test_agent_old.py │ │ └── test_streaming_agent.py │ │ ├── app │ │ ├── test_config.py │ │ └── test_routes.py │ │ ├── conftest.py │ │ ├── database │ │ ├── test_collections.py │ │ ├── test_conversations.py │ │ ├── test_graphs.py │ │ └── test_limits.py │ │ ├── document │ │ ├── test_chunks.py │ │ ├── test_document_processing.py │ │ └── test_documents.py 
│ │ └── retrieval │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_citations.py │ │ ├── test_database_filters.py │ │ ├── test_rag_processing.py │ │ └── test_retrieval_old.py └── uv.lock └── services ├── README.md ├── clustering ├── Dockerfile.clustering └── main.py └── unstructured ├── Dockerfile.unstructured ├── README.md └── main.py /.env.example: -------------------------------------------------------------------------------- 1 | # Environment variables for LLM provider(s) 2 | export OPENAI_API_KEY=sk-... 3 | # uncomment the following lines to enable other LLM providers 4 | # export ANTHROPIC_API_KEY=... 5 | # export VERTEX_API_KEY=... 6 | # export XAI_API_KEY=... 7 | # Add other provider keys as needed 8 | 9 | # Environment variables for the Postgres database 10 | export R2R_POSTGRES_USER=your_user 11 | export R2R_POSTGRES_PASSWORD=your_password 12 | export R2R_POSTGRES_HOST=your_host 13 | export R2R_POSTGRES_PORT=your_port 14 | export R2R_POSTGRES_DBNAME=your_db 15 | export R2R_PROJECT_NAME=your_project_name 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-documentation 2 | *.ipynb linguist-documentation 3 | templates/** linguist-vendored 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. 
See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/actions/login-docker/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Login Docker' 2 | description: 'Sets up Docker for running R2R' 3 | inputs: 4 | docker_username: 5 | description: 'Docker Hub username' 6 | required: true 7 | docker_password: 8 | description: 'Docker Hub password or token' 9 | required: true 10 | runs: 11 | using: "composite" 12 | steps: 13 | - name: Login to Docker Hub 14 | uses: docker/login-action@v2 15 | with: 16 | username: ${{ inputs.docker_username }} 17 | password: ${{ inputs.docker_password }} 18 | -------------------------------------------------------------------------------- /.github/actions/setup-docker/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Setup Docker' 2 | description: 'Sets up Docker for running R2R' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - name: Set up Docker 7 | uses: docker-practice/actions-setup-docker@master 8 | with: 9 | docker_version: 20.10 10 | docker_buildx: true 11 | 12 | - name: Set up Docker Buildx 13 | uses: docker/setup-buildx-action@v2 14 | -------------------------------------------------------------------------------- /.github/actions/setup-python-full/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Setup Python for R2R Full' 2 | description: 'Sets up Python and installs R2R dependencies for full installation' 3 | 4 | inputs: 5 | os: 6 | description: 'Operating system' 7 | required: true 8 | python-version: 9 | description: 'Python version to use' 10 | required: false 11 | default: '3.12' 12 | 13 | runs: 14 | using: "composite" 15 | steps: 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ inputs.python-version }} 20 | cache: 'pip' 21 | 22 | - name: Install R2R CLI & Python SDK 23 | shell: 
bash 24 | run: | 25 | pip install r2r 26 | 27 | - name: Install uv 28 | shell: bash 29 | run: | 30 | pip install uv 31 | 32 | - name: Install uv 33 | shell: bash 34 | run: | 35 | pip install uv 36 | 37 | - name: Cache uv dependencies 38 | uses: actions/cache@v4 39 | with: 40 | path: | 41 | py/.venv 42 | py/uv.lock 43 | key: ${{ runner.os }}-uv-${{ hashFiles('py/pyproject.toml', 'py/uv.lock') }} 44 | restore-keys: | 45 | ${{ runner.os }}-uv- 46 | 47 | - name: Install dependencies with uv 48 | shell: bash 49 | working-directory: py 50 | run: | 51 | uv sync --extra core 52 | -------------------------------------------------------------------------------- /.github/actions/setup-python-light/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Setup Python for R2R Light' 2 | description: 'Sets up Python environment and installs dependencies using uv' 3 | 4 | inputs: 5 | os: 6 | description: 'Operating system' 7 | required: true 8 | python-version: 9 | description: 'Python version to use' 10 | required: false 11 | default: '3.12' 12 | 13 | runs: 14 | using: "composite" 15 | steps: 16 | - name: Set up Python environment 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ inputs.python-version }} 20 | cache: 'pip' 21 | 22 | - name: Install uv 23 | shell: bash 24 | run: | 25 | pip install uv 26 | 27 | - name: Cache uv dependencies 28 | uses: actions/cache@v4 29 | with: 30 | path: | 31 | py/.venv 32 | py/uv.lock 33 | key: ${{ runner.os }}-uv-${{ hashFiles('py/pyproject.toml', 'py/uv.lock') }} 34 | restore-keys: | 35 | ${{ runner.os }}-uv- 36 | 37 | - name: Install dependencies with uv 38 | shell: bash 39 | working-directory: py 40 | run: | 41 | uv sync --extra core 42 | uv pip install pip wheel 43 | -------------------------------------------------------------------------------- /.github/actions/start-r2r-full/action.yml: -------------------------------------------------------------------------------- 1 | 
name: 'Start R2R Server' 2 | description: 'Starts the R2R server' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - name: Inspect Docker image manifests 7 | shell: bash 8 | run: | 9 | docker manifest inspect ragtoriches/prod:latest 10 | 11 | - name: Start R2R Server 12 | shell: bash 13 | run: | 14 | cd py 15 | docker build -t r2r/local . 16 | export R2R_CONFIG_NAME=full_azure 17 | export R2R_IMAGE=r2r/local 18 | docker compose -f r2r/compose.full.yaml --project-name r2r-full up -d 19 | uv run r2r serve --docker --full --config-name=full_azure --build --image=r2r-local 20 | -------------------------------------------------------------------------------- /.github/actions/start-r2r-light/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Start R2R Server' 2 | description: 'Starts the R2R server' 3 | inputs: 4 | config-name: 5 | description: 'The R2R configuration name to use' 6 | required: false 7 | default: 'r2r_azure_with_test_limits' 8 | runs: 9 | using: "composite" 10 | steps: 11 | - name: Start R2R server 12 | shell: bash 13 | run: | 14 | cd py 15 | export R2R_CONFIG_NAME=${{ inputs.config-name }} 16 | uv run python -m r2r.serve & 17 | echo "Waiting for services to start..." 
18 | sleep 30 19 | -------------------------------------------------------------------------------- /.github/workflows/build-cluster-service-docker.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Cluster Service Docker Image 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | REGISTRY_BASE: ragtoriches 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout Repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.12' 20 | 21 | - name: Install toml package 22 | run: pip install toml 23 | 24 | - name: Determine version 25 | id: version 26 | run: | 27 | echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/cluster-prod" >> $GITHUB_OUTPUT 28 | 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v3 31 | 32 | - name: Docker Auth 33 | uses: docker/login-action@v3 34 | with: 35 | username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }} 36 | password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }} 37 | 38 | - name: Build and push image 39 | uses: docker/build-push-action@v5 40 | with: 41 | context: ./services/clustering 42 | file: ./services/clustering/Dockerfile.clustering 43 | platforms: linux/amd64,linux/arm64 44 | push: true 45 | tags: ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest 46 | provenance: false 47 | sbom: false 48 | 49 | - name: Verify manifest 50 | run: | 51 | docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest 52 | -------------------------------------------------------------------------------- /.github/workflows/build-unst-service-docker.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Unstructured Service Docker Image 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | REGISTRY_BASE: ragtoriches 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: 
Checkout Repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.12' 20 | 21 | - name: Install toml package 22 | run: pip install toml 23 | 24 | - name: Determine version 25 | id: version 26 | run: | 27 | echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> $GITHUB_OUTPUT 28 | 29 | - name: Set up Docker Buildx 30 | uses: docker/setup-buildx-action@v3 31 | 32 | - name: Docker Auth 33 | uses: docker/login-action@v3 34 | with: 35 | username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }} 36 | password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }} 37 | 38 | - name: Build and push image 39 | uses: docker/build-push-action@v5 40 | with: 41 | context: ./services/unstructured 42 | file: ./services/unstructured/Dockerfile.unstructured 43 | platforms: linux/amd64,linux/arm64 44 | push: true 45 | tags: ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest 46 | provenance: false 47 | sbom: false 48 | 49 | - name: Verify manifest 50 | run: | 51 | docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest 52 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-npm.yml: -------------------------------------------------------------------------------- 1 | name: Publish NPM Package 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | working-directory: js/sdk 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Node.js 16 | uses: actions/setup-node@v3 17 | with: 18 | node-version: '20' 19 | registry-url: 'https://registry.npmjs.org' 20 | 21 | - name: Install pnpm 22 | uses: pnpm/action-setup@v2 23 | with: 24 | version: 6.0.2 25 | 26 | - name: Install dependencies 27 | run: pnpm install 28 | 29 | - name: Build 30 | run: pnpm run build 31 | 32 | - name: Publish to npm 33 | run: pnpm publish --no-git-checks 34 | env: 35 | NODE_AUTH_TOKEN: ${{ 
secrets.NPM_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | - dev-minor 8 | workflow_dispatch: 9 | 10 | jobs: 11 | publish: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.12' 21 | 22 | - name: Install tools 23 | run: pip install twine tomlkit build 24 | 25 | - name: Bump version for dev branches (TestPyPI) 26 | if: github.event_name == 'push' 27 | run: | 28 | cd py 29 | old_version=$(python -c "import tomlkit; d=tomlkit.parse(open('pyproject.toml').read()); print(d['project']['version'])") 30 | new_version="${old_version}a$(date +'%Y%m%d%H%M')" 31 | python -c "import tomlkit; d=tomlkit.parse(open('pyproject.toml').read()); d['project']['version']='$new_version'; open('pyproject.toml','w').write(tomlkit.dumps(d))" 32 | 33 | - name: Build distributions 34 | run: | 35 | cd py 36 | python -m build 37 | 38 | - name: Publish to TestPyPI 39 | if: github.event_name == 'push' 40 | env: 41 | PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring 42 | TEST_PYPI_API_TOKEN: ${{ secrets.TEST_PYPI_API_TOKEN }} 43 | run: | 44 | cd py 45 | twine upload --repository-url https://test.pypi.org/legacy/ -u __token__ -p "$TEST_PYPI_API_TOKEN" dist/* 46 | 47 | - name: Publish to PyPI 48 | if: github.event_name == 'workflow_dispatch' 49 | env: 50 | PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring 51 | PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 52 | run: | 53 | cd py 54 | twine upload -u __token__ -p "$PYPI_API_TOKEN" dist/* 55 | -------------------------------------------------------------------------------- /.github/workflows/quality.yml: 
-------------------------------------------------------------------------------- 1 | name: Code Quality Checks 2 | 3 | on: 4 | push: 5 | branches: [ '**' ] 6 | pull_request: 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.x' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install pre-commit 23 | pip install mypy 24 | pip install types-requests types-toml types-aiofiles 25 | 26 | - name: Run pre-commit hooks 27 | run: | 28 | pre-commit run --all-files 29 | -------------------------------------------------------------------------------- /.github/workflows/r2r-js-sdk-ci.yml: -------------------------------------------------------------------------------- 1 | name: R2R JS SDK Integration CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | paths: 7 | - 'js/sdk/**' 8 | pull_request: 9 | branches: [main] 10 | paths: 11 | - 'js/sdk/**' 12 | 13 | jobs: 14 | build-and-test: 15 | runs-on: ubuntu-latest 16 | 17 | defaults: 18 | run: 19 | working-directory: ./js/sdk 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: Use Node.js 25 | uses: actions/setup-node@v4 26 | with: 27 | node-version: "18" 28 | 29 | - name: Install pnpm 30 | uses: pnpm/action-setup@v4 31 | with: 32 | version: 8 33 | 34 | - name: Install dependencies 35 | run: pnpm install 36 | 37 | - name: Build 38 | run: pnpm run build 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | js/sdk/**/*.js 2 | js/tests/* 3 | **/.DS_Store 4 | 5 | .env 6 | .gitignore 7 | *.log 8 | .DS_Store 9 | *.gguf 10 | logs/ 11 | workspace/ 12 | py/workspace/ 13 | uploads/ 14 | **/__pycache__ 15 | **/.mypy_cache 16 | **/.pytest_cache 17 | dump/* 18 | .next 19 | node_modules 20 | .idea 21 | 
22 | coverage.xml 23 | .coverage 24 | 25 | **/*.sqlite* 26 | **/*.sqlite3* 27 | 28 | node_modules/ 29 | dist/ 30 | **/.data/* 31 | 32 | *.exe 33 | *.exe~ 34 | *.dll 35 | *.so 36 | *.dylib 37 | *.test 38 | go.work 39 | go.work.sum 40 | 41 | .vscode/ 42 | .python-version 43 | .ruff_cache/ 44 | *.egg-info 45 | .venv 46 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: ^.venv/ 7 | - id: end-of-file-fixer 8 | exclude: ^.venv/ 9 | - id: check-added-large-files 10 | exclude: ^.venv/ 11 | - id: check-ast 12 | exclude: ^.venv/ 13 | - id: check-yaml 14 | exclude: ^.venv/ 15 | 16 | - repo: local 17 | hooks: 18 | - id: check-typing-imports 19 | name: Check for Dict, List, or Union usage 20 | entry: bash -c 'echo "Checking for typing imports..." && FOUND=$(cd "$(git rev-parse --show-toplevel)" && find . -path "*/py/*.py" | grep -v "venv" | grep -v "/.venv/" | grep -v "/site-packages/" | grep -v "test_" | grep -v "/migrations/" | xargs grep -l "from typing.*import.*[^d]Dict\\|from typing.*import.*List\\|from typing.*import.*Union" 2>/dev/null || echo "") && if [ -n "$FOUND" ]; then echo "$FOUND"; echo " Please import dict instead of Dict, list instead of List, and the logical OR operator"; exit 1; else echo "No problematic imports found!"; exit 0; fi' 21 | language: system 22 | types: [python] 23 | pass_filenames: false 24 | 25 | - repo: local 26 | hooks: 27 | - id: check-print-statements 28 | name: Check for print statements 29 | entry: bash -c 'echo "Checking for print statements..." && FOUND=$(cd "$(git rev-parse --show-toplevel)" && find . 
-path "*/py/*.py" | grep -v "venv" | grep -v "/.venv/" | grep -v "/site-packages/" | grep -v "test_" | grep -v "/core/examples/" | grep -v "/migrations/" | grep -v "/tests/" | grep -v "/examples.py" | xargs grep -l "print(" 2>/dev/null || echo "") && if [ -n "$FOUND" ]; then echo "$FOUND"; echo "Found print statements!"; exit 1; else echo "No print statements found!"; exit 0; fi' 30 | language: system 31 | types: [python] 32 | pass_filenames: false 33 | exclude: ^(.venv/|py/.venv/|py/core/examples/|py/migrations/|py/tests/) 34 | 35 | - repo: https://github.com/astral-sh/ruff-pre-commit 36 | rev: v0.9.6 37 | hooks: 38 | - id: ruff 39 | args: [--fix] 40 | files: ^py/ 41 | exclude: ^(py/tests/|.venv/) 42 | - id: ruff-format 43 | files: ^py/ 44 | exclude: ^(py/tests/|.venv/) 45 | 46 | - repo: local 47 | hooks: 48 | - id: mypy 49 | name: mypy 50 | entry: bash -c 'cd "$(git rev-parse --show-toplevel)/py" && python -m mypy --exclude "migrations" --exclude "venv*" --exclude "test_*" .' 51 | language: system 52 | types: [python] 53 | pass_filenames: false 54 | exclude: ^(.venv/|migrations/) 55 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct Summary 2 | 3 | TL;DR: Be nice. Be respectful. Be professional. Don't be a jerk. 4 | 5 | ## Commitment 6 | 7 | We strive for a harassment-free, inclusive, and healthy community experience for all, regardless of personal characteristics or background. 8 | 9 | ## Expected Behaviors 10 | 11 | - **Empathy and Kindness**: Show understanding and kindness to others. 12 | - **Respect**: Value different viewpoints and experiences. 13 | - **Constructive Feedback**: Offer and accept feedback graciously. 14 | - **Accountability**: Own up to mistakes and learn from them. 15 | - **Community Focus**: Prioritize what's best for the whole community. 
16 | 17 | ## Unacceptable Behaviors 18 | 19 | - **Sexualized Content**: Avoid sexual language and unwelcome sexual attention. 20 | - **Disrespect**: No trolling, insults, or derogatory comments. 21 | - **Harassment**: Public or private harassment is unacceptable. 22 | - **Privacy Violations**: Do not share private information without consent. 23 | - **Inappropriate Conduct**: Behavior not suitable for a professional setting is not allowed. 24 | 25 | ## Enforcement 26 | 27 | - **Leaders' Responsibility**: Leaders clarify standards and take corrective actions. 28 | - **Scope**: Applies to all community spaces and when representing the community. 29 | - **Reporting**: Incidents can be reported to owen@sciphi.ai. 30 | 31 | ## Enforcement Guidelines 32 | 33 | - **Correction**: Private warning for unprofessional behavior. 34 | - **Warning**: Consequences for repeated violations. 35 | - **Temporary Ban**: For serious or sustained inappropriate behavior. 36 | - **Permanent Ban**: For egregious violations, including harassment. 37 | 38 | ## Attribution 39 | 40 | Adapted from the [Contributor Covenant version 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), with Community Impact Guidelines inspired by [Mozilla's code of conduct enforcement ladder](https://www.mozilla.org/en-US/about/governance/policies/participation/). 41 | 42 | For more details and FAQs, visit [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available [here](https://www.contributor-covenant.org/translations). 43 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # R2R Contribution Guide 2 | 3 | ## Quick Start 4 | 5 | - **Pre-Discussion**: Feel free to propose your ideas via issues or [Discord](https://discord.gg/p6KqD2kjtB) if you want to get early feedback. 
6 | - **Code of Conduct**: Adhere to our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions. 7 | - **Pull Requests (PRs)**: Follow the PR process for contributions. 8 | 9 | ## Pull Request Process 10 | 11 | 1. **Dependencies**: Ensure all dependencies are necessary and documented. 12 | 2. **Documentation**: Update README.md with any changes to interfaces, including new environment variables, exposed ports, and other relevant details. 13 | 3. **Versioning**: Increment version numbers in examples and README.md following [SemVer](http://semver.org/). 14 | 4. **Review**: A PR can be merged after receiving approval from at least two other developers. If you lack merge permissions, request a review for merging. 15 | 16 | ## Attribution 17 | 18 | This Code of Conduct is adapted from the [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4/). 19 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2024 EmergentAGI Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.md: -------------------------------------------------------------------------------- 1 | # The R2R Manifest 2 | 3 | We will do our best to build useful AI tools for developers _(before AGI)_. 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ./py/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | # Security Policy 3 | 4 | At R2R, we take the security of our project and its users seriously. We appreciate the contributions of security researchers and developers in helping us identify and address potential vulnerabilities. 5 | 6 | ## Reporting a Vulnerability 7 | 8 | If you discover a potential security vulnerability in R2R, please follow these steps to report it: 9 | 10 | 1. Create a new issue on the GitHub repository using the "Vulnerability Disclosure" issue template. 11 | 2. Set the issue as "confidential" if you are unsure whether the issue is a potential vulnerability or not. It is easier to make a confidential issue public than to remediate an issue that should have been confidential. 12 | 3. Label the issue with the `security` label at a minimum. Additional labels may be applied by the security team and other project maintainers to assist with the triage process. 13 | 4. Provide a detailed description of the vulnerability, including steps to reproduce, potential impact, and any other relevant information. 14 | 5. 
If the issue contains sensitive information or user-specific data, such as private repository contents, assign the `keep confidential` label to the issue. If possible, avoid including such information directly in the issue and instead provide links to resources that are only accessible to the project maintainers. 15 | 16 | ## Vulnerability Handling Process 17 | 18 | Once a vulnerability is reported, the R2R security team will follow these steps: 19 | 20 | 1. Acknowledge receipt of the vulnerability report within 48 hours. 21 | 2. Assess the severity and impact of the vulnerability. 22 | 3. Develop a fix or mitigation plan for the vulnerability. 23 | 4. Notify the reporter about the progress and estimated timeline for the fix. 24 | 5. Once the fix is ready, release a new version of R2R that addresses the vulnerability. 25 | 6. Publicly disclose the vulnerability and the fix after a reasonable period to allow users to update their installations. 26 | 27 | ## Scope 28 | 29 | This security policy applies to the R2R codebase and its dependencies. It does not cover vulnerabilities in the underlying operating systems, hardware, or third-party libraries used by R2R. 30 | 31 | ## Recognition 32 | 33 | We greatly appreciate the efforts of security researchers and developers who responsibly disclose vulnerabilities to us. With your permission, we will acknowledge your contribution in the release notes and any public disclosures related to the vulnerability. 34 | 35 | ## Contact 36 | 37 | If you have any questions or concerns regarding the security of R2R, please contact the project maintainers at [security@r2r.com](mailto:security@r2r.com). 38 | 39 | Thank you for helping us keep R2R and its users secure! 
40 | -------------------------------------------------------------------------------- /docker/compose.yaml: -------------------------------------------------------------------------------- 1 | volumes: 2 | postgres_data: 3 | name: postgres_data 4 | minio_data: 5 | name: minio_data 6 | 7 | services: 8 | postgres: 9 | image: pgvector/pgvector:pg16 10 | profiles: [postgres] 11 | env_file: 12 | - ./env/postgres.env 13 | volumes: 14 | - postgres_data:/var/lib/postgresql/data 15 | ports: 16 | - "5432:5432" 17 | healthcheck: 18 | test: ["CMD-SHELL", "pg_isready -U postgres"] 19 | interval: 10s 20 | timeout: 5s 21 | retries: 5 22 | restart: on-failure 23 | command: > 24 | postgres 25 | -c max_connections=1024 26 | 27 | minio: 28 | image: minio/minio 29 | profiles: [minio] 30 | env_file: 31 | - ./env/minio.env 32 | volumes: 33 | - minio_data:/data 34 | ports: 35 | - "9000:9000" 36 | - "9001:9001" 37 | healthcheck: 38 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 39 | interval: 10s 40 | timeout: 5s 41 | retries: 5 42 | restart: on-failure 43 | command: server /data --console-address ":9001" 44 | 45 | graph_clustering: 46 | image: ragtoriches/cluster-prod 47 | ports: 48 | - "7276:7276" 49 | healthcheck: 50 | test: ["CMD", "curl", "-f", "http://localhost:7276/health"] 51 | interval: 10s 52 | timeout: 5s 53 | retries: 5 54 | 55 | r2r: 56 | image: sciphiai/r2r:latest 57 | ports: 58 | - "7272:7272" 59 | env_file: 60 | - ./env/r2r.env 61 | healthcheck: 62 | test: ["CMD", "curl", "-f", "http://localhost:7272/v3/health"] 63 | interval: 6s 64 | timeout: 5s 65 | retries: 5 66 | restart: on-failure 67 | volumes: 68 | - ./user_configs:/app/user_configs 69 | - ./user_tools:/app/user_tools 70 | extra_hosts: 71 | - host.docker.internal:host-gateway 72 | 73 | r2r-dashboard: 74 | image: sciphiai/r2r-dashboard:1.0.3 75 | env_file: 76 | - ./env/r2r-dashboard.env 77 | ports: 78 | - "7273:3000" 79 | 
-------------------------------------------------------------------------------- /docker/env/hatchet.env: -------------------------------------------------------------------------------- 1 | DATABASE_URL="postgres://hatchet_user:hatchet_password@hatchet-postgres:5432/hatchet?sslmode=disable" 2 | 3 | HATCHET_CLIENT_GRPC_MAX_RECV_MESSAGE_LENGTH=134217728 4 | HATCHET_CLIENT_GRPC_MAX_SEND_MESSAGE_LENGTH=134217728 5 | 6 | DATABASE_POSTGRES_PORT=5432 7 | DATABASE_POSTGRES_HOST=hatchet-postgres 8 | DATABASE_POSTGRES_USERNAME=hatchet_user 9 | DATABASE_POSTGRES_PASSWORD=hatchet_password 10 | HATCHET_DATABASE_POSTGRES_DB_NAME=hatchet 11 | POSTGRES_DB=hatchet 12 | POSTGRES_USER=hatchet_user 13 | POSTGRES_PASSWORD=hatchet_password 14 | 15 | SERVER_TASKQUEUE_RABBITMQ_URL=amqp://user:password@hatchet-rabbitmq:5672/ 16 | SERVER_AUTH_COOKIE_DOMAIN=http://host.docker.internal:7274 17 | SERVER_URL=http://host.docker.internal:7274 18 | SERVER_AUTH_COOKIE_INSECURE=t 19 | SERVER_GRPC_BIND_ADDRESS=0.0.0.0 20 | SERVER_GRPC_INSECURE=t 21 | SERVER_GRPC_BROADCAST_ADDRESS=hatchet-engine:7077 22 | SERVER_GRPC_MAX_MSG_SIZE=134217728 23 | SERVER_GRPC_PORT="7077" 24 | 25 | RABBITMQ_DEFAULT_USER=user 26 | RABBITMQ_DEFAULT_PASS=password 27 | -------------------------------------------------------------------------------- /docker/env/minio.env: -------------------------------------------------------------------------------- 1 | MINIO_ROOT_USER=minioadmin 2 | MINIO_ROOT_PASSWORD=minioadmin 3 | -------------------------------------------------------------------------------- /docker/env/postgres.env: -------------------------------------------------------------------------------- 1 | POSTGRES_USER=postgres 2 | POSTGRES_PASSWORD=postgres 3 | POSTGRES_HOST=postgres 4 | POSTGRES_PORT=5432 5 | POSTGRES_MAX_CONNECTIONS=1024 6 | PGPORT=5432 7 | -------------------------------------------------------------------------------- /docker/env/r2r-dashboard.env: 
-------------------------------------------------------------------------------- 1 | NEXT_PUBLIC_R2R_DEPLOYMENT_URL=http://localhost:7272 2 | NEXT_PUBLIC_HATCHET_DASHBOARD_URL=http://localhost:7274 3 | NEXT_PUBLIC_R2R_DEFAULT_EMAIL="admin@example.com" 4 | NEXT_PUBLIC_R2R_DEFAULT_PASSWORD="change_me_immediately" 5 | -------------------------------------------------------------------------------- /docker/env/r2r-full.env: -------------------------------------------------------------------------------- 1 | # R2R 2 | R2R_PORT=7272 3 | R2R_HOST=0.0.0.0 4 | R2R_LOG_LEVEL=INFO 5 | R2R_CONFIG_NAME=full 6 | R2R_CONFIG_PATH= 7 | R2R_PROJECT_NAME=r2r_default 8 | R2R_SECRET_KEY= 9 | R2R_USER_TOOLS_PATH=/app/user_tools 10 | R2R_LOG_FORMAT= 11 | 12 | # Postgres Configuration 13 | R2R_POSTGRES_USER=postgres 14 | R2R_POSTGRES_PASSWORD=postgres 15 | R2R_POSTGRES_HOST=postgres 16 | R2R_POSTGRES_PORT=5432 17 | R2R_POSTGRES_DBNAME=postgres 18 | R2R_POSTGRES_MAX_CONNECTIONS=1024 19 | R2R_POSTGRES_STATEMENT_CACHE_SIZE=100 20 | 21 | # Hatchet 22 | HATCHET_CLIENT_TLS_STRATEGY=none 23 | 24 | # OpenAI 25 | OPENAI_API_KEY= 26 | OPENAI_API_BASE= 27 | 28 | # Azure Foundry 29 | AZURE_FOUNDRY_API_ENDPOINT= 30 | AZURE_FOUNDRY_API_KEY= 31 | 32 | # XAI / GROK 33 | XAI_API_KEY= 34 | 35 | # Anthropic 36 | ANTHROPIC_API_KEY= 37 | 38 | # Azure 39 | AZURE_API_KEY= 40 | AZURE_API_BASE= 41 | AZURE_API_VERSION= 42 | 43 | # Google Vertex AI 44 | GOOGLE_APPLICATION_CREDENTIALS= 45 | VERTEX_PROJECT= 46 | VERTEX_LOCATION= 47 | 48 | # Google Gemini 49 | GEMINI_API_KEY= 50 | 51 | # Mistral 52 | MISTRAL_API_KEY= 53 | 54 | # AWS Bedrock 55 | AWS_ACCESS_KEY_ID= 56 | AWS_SECRET_ACCESS_KEY= 57 | AWS_REGION_NAME= 58 | 59 | # Groq 60 | GROQ_API_KEY= 61 | 62 | # Cohere 63 | COHERE_API_KEY= 64 | 65 | # Anyscale 66 | ANYSCALE_API_KEY= 67 | 68 | # Ollama 69 | OLLAMA_API_BASE=http://host.docker.internal:11434 70 | 71 | # LM Studio 72 | LM_STUDIO_API_BASE=http://host.docker.internal:1234 73 | LM_STUDIO_API_KEY=1234 74 | 75 | 
# Huggingface 76 | HUGGINGFACE_API_BASE=http://host.docker.internal:8080 77 | HUGGINGFACE_API_KEY= 78 | 79 | # Unstructured 80 | UNSTRUCTURED_API_KEY= 81 | UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general 82 | UNSTRUCTURED_SERVICE_URL=http://unstructured:7275 83 | UNSTRUCTURED_NUM_WORKERS=10 84 | 85 | # Graphologic 86 | CLUSTERING_SERVICE_URL=http://graph_clustering:7276 87 | 88 | # OAuth Credentials 89 | GOOGLE_CLIENT_ID= 90 | GOOGLE_CLIENT_SECRET= 91 | GOOGLE_REDIRECT_URI= 92 | 93 | GITHUB_CLIENT_ID= 94 | GITHUB_CLIENT_SECRET= 95 | GITHUB_REDIRECT_URI= 96 | 97 | # Email 98 | MAILERSEND_API_KEY= 99 | SENDGRID_API_KEY= 100 | 101 | # Websearch 102 | FIRECRAWL_API_KEY= 103 | SERPER_API_KEY= 104 | TAVILY_API_KEY= 105 | 106 | # Sentry Tracing 107 | R2R_SENTRY_DSN= 108 | R2R_SENTRY_ENVIRONMENT= 109 | R2R_SENTRY_TRACES_SAMPLE_RATE= 110 | R2R_SENTRY_PROFILES_SAMPLE_RATE= 111 | -------------------------------------------------------------------------------- /docker/env/r2r.env: -------------------------------------------------------------------------------- 1 | # R2R 2 | R2R_PORT=7272 3 | R2R_HOST=0.0.0.0 4 | R2R_LOG_LEVEL=INFO 5 | R2R_CONFIG_NAME= 6 | R2R_CONFIG_PATH= 7 | R2R_PROJECT_NAME=r2r_default 8 | R2R_SECRET_KEY= 9 | R2R_USER_TOOLS_PATH=/app/user_tools 10 | R2R_LOG_FORMAT= 11 | 12 | # Postgres Configuration 13 | R2R_POSTGRES_USER=postgres 14 | R2R_POSTGRES_PASSWORD=postgres 15 | R2R_POSTGRES_HOST=postgres 16 | R2R_POSTGRES_PORT=5432 17 | R2R_POSTGRES_DBNAME=postgres 18 | R2R_POSTGRES_MAX_CONNECTIONS=1024 19 | R2R_POSTGRES_STATEMENT_CACHE_SIZE=100 20 | 21 | # Hatchet 22 | HATCHET_CLIENT_TLS_STRATEGY=none 23 | 24 | # OpenAI 25 | OPENAI_API_KEY= 26 | OPENAI_API_BASE= 27 | 28 | # Azure Foundry 29 | AZURE_FOUNDRY_API_ENDPOINT= 30 | AZURE_FOUNDRY_API_KEY= 31 | 32 | # XAI / GROK 33 | XAI_API_KEY= 34 | 35 | # Anthropic 36 | ANTHROPIC_API_KEY= 37 | 38 | # Azure 39 | AZURE_API_KEY= 40 | AZURE_API_BASE= 41 | AZURE_API_VERSION= 42 | 43 | # Google Vertex AI 
44 | GOOGLE_APPLICATION_CREDENTIALS= 45 | VERTEX_PROJECT= 46 | VERTEX_LOCATION= 47 | 48 | # Google Gemini 49 | GEMINI_API_KEY= 50 | 51 | # Mistral 52 | MISTRAL_API_KEY= 53 | 54 | # AWS Bedrock 55 | AWS_ACCESS_KEY_ID= 56 | AWS_SECRET_ACCESS_KEY= 57 | AWS_REGION_NAME= 58 | 59 | # Groq 60 | GROQ_API_KEY= 61 | 62 | # Cohere 63 | COHERE_API_KEY= 64 | 65 | # Anyscale 66 | ANYSCALE_API_KEY= 67 | 68 | # Ollama 69 | OLLAMA_API_BASE=http://host.docker.internal:11434 70 | 71 | # LM Studio 72 | LM_STUDIO_API_BASE=http://host.docker.internal:1234 73 | LM_STUDIO_API_KEY=1234 74 | 75 | # Huggingface 76 | HUGGINGFACE_API_BASE=http://host.docker.internal:8080 77 | HUGGINGFACE_API_KEY= 78 | 79 | # Unstructured 80 | UNSTRUCTURED_API_KEY= 81 | UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general 82 | UNSTRUCTURED_SERVICE_URL=http://unstructured:7275 83 | UNSTRUCTURED_NUM_WORKERS=10 84 | 85 | # Graphologic 86 | CLUSTERING_SERVICE_URL=http://graph_clustering:7276 87 | 88 | # OAuth Credentials 89 | GOOGLE_CLIENT_ID= 90 | GOOGLE_CLIENT_SECRET= 91 | GOOGLE_REDIRECT_URI= 92 | 93 | GITHUB_CLIENT_ID= 94 | GITHUB_CLIENT_SECRET= 95 | GITHUB_REDIRECT_URI= 96 | 97 | # Email 98 | MAILERSEND_API_KEY= 99 | SENDGRID_API_KEY= 100 | 101 | # Websearch 102 | FIRECRAWL_API_KEY= 103 | SERPER_API_KEY= 104 | TAVILY_API_KEY= 105 | 106 | # Sentry Tracing 107 | R2R_SENTRY_DSN= 108 | R2R_SENTRY_ENVIRONMENT= 109 | R2R_SENTRY_TRACES_SAMPLE_RATE= 110 | R2R_SENTRY_PROFILES_SAMPLE_RATE= 111 | -------------------------------------------------------------------------------- /docker/fluent-bit/fluent-bit.conf: -------------------------------------------------------------------------------- 1 | [SERVICE] 2 | Flush 1 3 | Daemon Off 4 | Log_Level info 5 | Parsers_File parsers.conf 6 | 7 | [INPUT] 8 | Tag backend 9 | Name forward 10 | Listen 0.0.0.0 11 | Port 24224 12 | 13 | [FILTER] 14 | Match backend 15 | Name parser 16 | Key_Name log 17 | Parser json 18 | 19 | [OUTPUT] 20 | Match backend 21 | Name http 22 
| host host.docker.internal 23 | port 9428 24 | uri /insert/jsonline?_stream_fields=log&_msg_field=msg,message&_time_field=date 25 | format json_lines 26 | json_date_format iso8601 27 | -------------------------------------------------------------------------------- /docker/fluent-bit/parsers.conf: -------------------------------------------------------------------------------- 1 | [PARSER] 2 | Name json 3 | Format json 4 | -------------------------------------------------------------------------------- /docker/scripts/create-hatchet-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | echo 'Waiting for PostgreSQL to be ready...' 5 | while ! pg_isready -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user}; do 6 | sleep 1 7 | done 8 | 9 | echo 'PostgreSQL is ready, checking if database exists...' 10 | if ! PGPASSWORD=${HATCHET_POSTGRES_PASSWORD:-hatchet_password} psql -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user} -lqt | grep -qw ${HATCHET_POSTGRES_DBNAME:-hatchet}; then 11 | echo 'Database does not exist, creating it...' 12 | PGPASSWORD=${HATCHET_POSTGRES_PASSWORD:-hatchet_password} createdb -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user} -w ${HATCHET_POSTGRES_DBNAME:-hatchet} 13 | else 14 | echo 'Database already exists, skipping creation.' 15 | fi 16 | -------------------------------------------------------------------------------- /docker/scripts/setup-token.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | echo 'Starting token creation process...' 
5 | 6 | # Attempt to create token and capture both stdout and stderr 7 | TOKEN_OUTPUT=$(/hatchet/hatchet-admin token create --config /hatchet/config --tenant-id 707d0855-80ab-4e1f-a156-f1c4546cbf52 2>&1) 8 | 9 | # Extract the token (assuming it's the only part that looks like a JWT) 10 | TOKEN=$(echo "$TOKEN_OUTPUT" | grep -Eo 'eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*') 11 | 12 | if [ -z "$TOKEN" ]; then 13 | echo 'Error: Failed to extract token. Full command output:' >&2 14 | echo "$TOKEN_OUTPUT" >&2 15 | exit 1 16 | fi 17 | 18 | echo "$TOKEN" > /tmp/hatchet_api_key 19 | echo 'Token created and saved to /tmp/hatchet_api_key' 20 | 21 | # Copy token to final destination 22 | echo -n "$TOKEN" > /hatchet_api_key/api_key.txt 23 | echo 'Token copied to /hatchet_api_key/api_key.txt' 24 | 25 | # Verify token was copied correctly 26 | if [ "$(cat /tmp/hatchet_api_key)" != "$(cat /hatchet_api_key/api_key.txt)" ]; then 27 | echo 'Error: Token copy failed, files do not match' >&2 28 | echo 'Content of /tmp/hatchet_api_key:' 29 | cat /tmp/hatchet_api_key 30 | echo 'Content of /hatchet_api_key/api_key.txt:' 31 | cat /hatchet_api_key/api_key.txt 32 | exit 1 33 | fi 34 | 35 | echo 'Hatchet API key has been saved successfully' 36 | echo 'Token length:' ${#TOKEN} 37 | echo 'Token (first 20 chars):' ${TOKEN:0:20} 38 | echo 'Token structure:' $(echo $TOKEN | awk -F. '{print NF-1}') 'parts' 39 | 40 | # Check each part of the token 41 | for i in 1 2 3; do 42 | PART=$(echo $TOKEN | cut -d. -f$i) 43 | echo 'Part' $i 'length:' ${#PART} 44 | echo 'Part' $i 'base64 check:' $(echo $PART | base64 -d >/dev/null 2>&1 && echo 'Valid' || echo 'Invalid') 45 | done 46 | 47 | # Final validation attempt 48 | if ! echo $TOKEN | awk -F. '{print $2}' | base64 -d 2>/dev/null | jq . 
>/dev/null 2>&1; then 49 | echo 'Warning: Token payload is not valid JSON when base64 decoded' >&2 50 | else 51 | echo 'Token payload appears to be valid JSON' 52 | fi 53 | -------------------------------------------------------------------------------- /docker/scripts/start-r2r.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if HATCHET_CLIENT_TOKEN is set, if not read it from the API key file 4 | if [ -z "${HATCHET_CLIENT_TOKEN}" ]; then 5 | export HATCHET_CLIENT_TOKEN=$(cat /hatchet_api_key/api_key.txt) 6 | fi 7 | 8 | # Start the application 9 | exec uvicorn core.main.app_entry:app --host ${R2R_HOST} --port ${R2R_PORT} 10 | -------------------------------------------------------------------------------- /docker/user_configs/README.md: -------------------------------------------------------------------------------- 1 | # User Configs Directory 2 | 3 | ## Overview 4 | This directory is mounted inside the R2R Docker container and is intended for custom configuration files. Any files placed here will be accessible to the application running in the container. 5 | 6 | ## Usage 7 | 1. Place your custom configuration files in this directory. 8 | 2. Set the `R2R_CONFIG_PATH` in the `r2r.env` or `r2r-full.env` files. 9 | 3. The path format inside the container is: `/app/user_configs/.toml` 10 | 11 | ## Configuration 12 | The application uses the environment variable you set to locate your configuration file: 13 | ``` 14 | R2R_CONFIG_PATH=/app/user_configs/.toml 15 | ``` 16 | 17 | If you want to use a different filename, update the `R2R_CONFIG_PATH` variable in your environment file to point to your custom file, for example: 18 | ``` 19 | R2R_CONFIG_PATH=/app/user_configs/my_custom_config.toml 20 | ``` 21 | 22 | ## Troubleshooting 23 | If you encounter configuration errors, check: 24 | 1. Your configuration file exists in this directory 25 | 2. 
The filename matches what's specified in `R2R_CONFIG_PATH` 26 | 3. The file has proper permissions (readable) 27 | 4. The file contains valid TOML syntax 28 | 29 | For more detailed configuration information, see the main documentation. 30 | -------------------------------------------------------------------------------- /docker/user_tools/README.md: -------------------------------------------------------------------------------- 1 | # User-Defined Tools Directory 2 | 3 | ## Overview 4 | This directory is mounted inside the R2R Docker container and is intended for custom tool files. Any files placed here will be accessible to the application running in the container. 5 | 6 | ## Usage 7 | 1. Place your custom tool definitions in this directory. Utilize the template structure demonstrated here. 8 | 2. Add any additional dependencies that you may need to the user_requirements.txt file in this directory. 9 | 3. Include the tool in your agent configuration. 10 | 11 | ## Creating a tool 12 | ```python 13 | from core.base.agent.tools.base import Tool 14 | 15 | 16 | class ToolNameTool(Tool): 17 | """ 18 | A user defined tool. 19 | """ 20 | 21 | def __init__(self): 22 | super().__init__( 23 | name="tool_name", 24 | description="A natural language tool description that is shown to the agent.", 25 | parameters={ 26 | "type": "object", 27 | "properties": { 28 | "input_parameter": { 29 | "type": "string", 30 | "description": "Define any input parameters by their name and type", 31 | }, 32 | }, 33 | "required": ["input_parameter"], 34 | }, 35 | results_function=self.execute, 36 | llm_format_function=None, 37 | ) 38 | 39 | async def execute(self, input_parameter: str, *args, **kwargs): 40 | """ 41 | Implementation of the tool. 
42 | """ 43 | 44 | # Any custom tool logic can go here 45 | 46 | output_response = some_method(input_parameter) 47 | 48 | result = AggregateSearchResult( 49 | generic_tool_result=[output_response], 50 | ) 51 | 52 | # Add to results collector if context is provided 53 | if (context := kwargs.get("context")) and hasattr(context, "search_results_collector"): 54 | context.search_results_collector.add_aggregate_result(result) 55 | 56 | return result 57 | ``` 58 | 59 | ## Troubleshooting 60 | 61 | For more detailed configuration information, see the main documentation. 62 | -------------------------------------------------------------------------------- /docker/user_tools/user_requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/docker/user_tools/user_requirements.txt -------------------------------------------------------------------------------- /js/README.md: -------------------------------------------------------------------------------- 1 | # R2R JavaScript SDK Documentation 2 | 3 | For a complete look at the R2R JavaScript SDK, [visit our documentation.](https://r2r-docs.sciphi.ai/api-and-sdks/introduction) 4 | 5 | ## Installation 6 | 7 | Before starting, make sure you have completed the [R2R installation](https://r2r-docs.sciphi.ai/documentation/installation/overview). 8 | 9 | Install the R2R JavaScript SDK: 10 | 11 | ```bash 12 | npm install r2r-js 13 | ``` 14 | 15 | ## Getting Started 16 | 17 | 1. Import the R2R client: 18 | 19 | ```javascript 20 | const { r2rClient } = require('r2r-js'); 21 | ``` 22 | 23 | 2. Initialize the client: 24 | 25 | ```javascript 26 | const client = new r2rClient('http://localhost:7272'); 27 | ``` 28 | 29 | 3. Check if R2R is running correctly: 30 | 31 | ```javascript 32 | const healthResponse = await client.health(); 33 | // {"status":"ok"} 34 | ``` 35 | 36 | 4. 
Login (Optional): 37 | ```javascript 38 | // client.register("me@email.com", "my_password"), 39 | // client.verify_email("me@email.com", "my_verification_code") 40 | client.login("me@email.com", "my_password") 41 | ``` 42 | When using authentication the commands below automatically restrict the scope to a user's available documents. 43 | -------------------------------------------------------------------------------- /js/sdk/.prettierignore: -------------------------------------------------------------------------------- 1 | examples/ 2 | -------------------------------------------------------------------------------- /js/sdk/__tests__/PromptsIntegrationSuperUser.test.ts: -------------------------------------------------------------------------------- 1 | import { r2rClient } from "../src/index"; 2 | import { describe, test, beforeAll, expect } from "@jest/globals"; 3 | 4 | const baseUrl = "http://localhost:7272"; 5 | 6 | describe("r2rClient V3 Collections Integration Tests", () => { 7 | let client: r2rClient; 8 | 9 | beforeAll(async () => { 10 | client = new r2rClient(baseUrl); 11 | await client.users.login({ 12 | email: "admin@example.com", 13 | password: "change_me_immediately", 14 | }); 15 | }); 16 | 17 | test("List prompts", async () => { 18 | const response = await client.prompts.list(); 19 | expect(response.results).toBeDefined(); 20 | }); 21 | 22 | test("Create a prompt", async () => { 23 | const response = await client.prompts.create({ 24 | name: "test-prompt", 25 | template: "Hello, {name}!", 26 | inputTypes: { name: "string" }, 27 | }); 28 | expect(response.results).toBeDefined(); 29 | }); 30 | 31 | test("Retrieve a prompt", async () => { 32 | const response = await client.prompts.retrieve({ 33 | name: "test-prompt", 34 | }); 35 | expect(response.results).toBeDefined(); 36 | }); 37 | 38 | test("Update a prompt", async () => { 39 | const response = await client.prompts.update({ 40 | name: "test-prompt", 41 | template: "Hello, {name}! 
How are you?", 42 | inputTypes: { name: "string" }, 43 | }); 44 | expect(response.results).toBeDefined(); 45 | }); 46 | 47 | test("Delete a prompt", async () => { 48 | const response = await client.prompts.delete({ 49 | name: "test-prompt", 50 | }); 51 | expect(response.results).toBeDefined(); 52 | }); 53 | }); 54 | -------------------------------------------------------------------------------- /js/sdk/__tests__/SystemIntegrationSuperUser.test.ts: -------------------------------------------------------------------------------- 1 | import { r2rClient } from "../src/index"; 2 | import { describe, test, beforeAll, expect } from "@jest/globals"; 3 | 4 | const baseUrl = "http://localhost:7272"; 5 | 6 | describe("r2rClient V3 Collections Integration Tests", () => { 7 | let client: r2rClient; 8 | 9 | beforeAll(async () => { 10 | client = new r2rClient(baseUrl); 11 | await client.users.login({ 12 | email: "admin@example.com", 13 | password: "change_me_immediately", 14 | }); 15 | }); 16 | 17 | test("Get the health of the system", async () => { 18 | const response = await client.system.health(); 19 | expect(response.results).toBeDefined(); 20 | }); 21 | 22 | test("Get the settings of the system", async () => { 23 | const response = await client.system.settings(); 24 | expect(response.results).toBeDefined(); 25 | }); 26 | 27 | test("Get the status of the system", async () => { 28 | const response = await client.system.status(); 29 | expect(response.results).toBeDefined(); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /js/sdk/__tests__/SystemIntegrationUser.test.ts: -------------------------------------------------------------------------------- 1 | import { r2rClient } from "../src/index"; 2 | import { describe, test, beforeAll, expect } from "@jest/globals"; 3 | 4 | const baseUrl = "http://localhost:7272"; 5 | 6 | describe("r2rClient V3 System Integration Tests User", () => { 7 | let client: r2rClient; 8 | let 
userId: string; 9 | let name: string | undefined; 10 | 11 | beforeAll(async () => { 12 | client = new r2rClient(baseUrl); 13 | }); 14 | 15 | test("Register a new user", async () => { 16 | const response = await client.users.create({ 17 | email: "system_integration_test_user@example.com", 18 | password: "change_me_immediately", 19 | name: "Test User", 20 | bio: "This is the bio of the test user.", 21 | }); 22 | 23 | userId = response.results.id; 24 | name = response.results.name; 25 | expect(response.results).toBeDefined(); 26 | expect(response.results.isSuperuser).toBe(false); 27 | expect(response.results.name).toBe("Test User"); 28 | expect(response.results.bio).toBe("This is the bio of the test user."); 29 | }); 30 | 31 | test("Login as a user", async () => { 32 | const response = await client.users.login({ 33 | email: "system_integration_test_user@example.com", 34 | password: "change_me_immediately", 35 | }); 36 | expect(response.results).toBeDefined(); 37 | }); 38 | 39 | test("Get the health of the system", async () => { 40 | const response = await client.system.health(); 41 | expect(response.results).toBeDefined(); 42 | }); 43 | 44 | test("Only a superuser can call the `system/settings` endpoint.", async () => { 45 | await expect(client.system.settings()).rejects.toThrow(/Status 403/); 46 | }); 47 | 48 | test("Only an authorized user can call the `system/status` endpoint.", async () => { 49 | await expect(client.system.status()).rejects.toThrow(/Status 403/); 50 | }); 51 | 52 | test("Delete a user", async () => { 53 | const response = await client.users.delete({ 54 | id: userId, 55 | password: "change_me_immediately", 56 | }); 57 | expect(response.results).toBeDefined(); 58 | }); 59 | }); 60 | -------------------------------------------------------------------------------- /js/sdk/examples/data/folder/karamozov.txt: -------------------------------------------------------------------------------- 1 | Alexius Fyodorovich Karamazov erat tertius filius Fyodoris 
Pavlovich Karamazov 2 | possessoris terrarum in nostro districtu bene noti sua aetate, et adhuc apud nos 3 | memoriae mandati ob mortem tragicam et obscuram, quae tredecim annos abhinc 4 | accidit, quamque suo loco describam. 5 | -------------------------------------------------------------------------------- /js/sdk/examples/data/folder/myshkin.txt: -------------------------------------------------------------------------------- 1 | Sub finem Novembris, tempore liquationis, hora nona mane, tramen in via 2 | ferrea Varsaviae et Petropoli plenis velocitatibus Petropolim 3 | appropinquabat. Dies ita humidus et nebulosus erat ut magno cum labore 4 | viatores invicem videre possent. 5 | -------------------------------------------------------------------------------- /js/sdk/examples/data/invalid.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "John Doe" 3 | "age": 30, 4 | 'address': '123 Main St', 5 | "phone_numbers": [ 6 | "555-0123", 7 | "555-4567", 8 | ], 9 | "is_active": True, 10 | "details": { 11 | "occupation": "developer" 12 | "skills": ["python", "javascript"] 13 | } 14 | "notes": "Some text with "nested" quotes" 15 | } 16 | -------------------------------------------------------------------------------- /js/sdk/examples/data/marmeladov.txt: -------------------------------------------------------------------------------- 1 | His conversation seemed to excite a general though languid interest. The 2 | boys at the counter fell to sniggering. The innkeeper came down from the 3 | upper room, apparently on purpose to listen to the “funny fellow” 4 | and sat down at a little distance, yawning lazily, but with dignity. 5 | Evidently Marmeladov was a familiar figure here, and he had most 6 | likely acquired his weakness for high-flown speeches from the habit of 7 | frequently entering into conversation with strangers of all sorts in 8 | the tavern. 
This habit develops into a necessity in some drunkards, and 9 | especially in those who are looked after sharply and kept in order 10 | at home. Hence in the company of other drinkers they try to justify 11 | themselves and even if possible obtain consideration. 12 | 13 | “Funny fellow!” pronounced the innkeeper. “And why don’t you work, why 14 | aren’t you at your duty, if you are in the service?” 15 | 16 | “Why am I not at my duty, honoured sir,” Marmeladov went on, addressing 17 | himself exclusively to Raskolnikov, as though it had been he who put 18 | that question to him. “Why am I not at my duty? Does not my heart ache 19 | to think what a useless worm I am? A month ago when Mr. Lebeziatnikov 20 | beat my wife with his own hands, and I lay drunk, didn’t I suffer? 21 | Excuse me, young man, has it ever happened to you... hm... well, to 22 | petition hopelessly for a loan?” 23 | -------------------------------------------------------------------------------- /js/sdk/examples/data/raskolnikov.txt: -------------------------------------------------------------------------------- 1 | In vespera praecipue calida ineunte Iulio iuvenis e cenaculo in quo hospitabatur in 2 | S. loco exiit et lente, quasi dubitans, versus pontem K. ambulavit. Feliciter vitavit 3 | ne domina sua eum in scala occurreret. Cenaculum suum sub tecto domus altae, quinque 4 | tabulatorum, erat, et magis armario quam conclavi simile erat. Domina, quae ei cenaculum, 5 | prandia et ministerium praebebat, in tabulato infra habitabat, et quotienscumque exibat, 6 | praeterire culinam eius, cuius ianua semper aperta erat, cogebatur. Et quoties praeteribat, 7 | iuvenis aegrotum et pavidum sensum habebat, quod eum corrugare frontem et pudere faciebat. 8 | Desperanter apud dominam suam aere alieno obrutus erat, et eam convenire timebat. 
9 | -------------------------------------------------------------------------------- /js/sdk/examples/data/raskolnikov_2.txt: -------------------------------------------------------------------------------- 1 | When Raskolnikov got home, his hair was soaked with sweat and he was 2 | breathing heavily. He went rapidly up the stairs, walked into his 3 | unlocked room and at once fastened the latch. Then in senseless terror 4 | he rushed to the corner, to that hole under the paper where he had put 5 | the things; put his hand in, and for some minutes felt carefully in the 6 | hole, in every crack and fold of the paper. Finding nothing, he got up 7 | and drew a deep breath. 8 | -------------------------------------------------------------------------------- /js/sdk/examples/data/sonia.txt: -------------------------------------------------------------------------------- 1 | On the canal bank near the bridge and not two houses away from the one 2 | where Sonia lodged, there was a crowd of people, consisting principally 3 | of gutter children. The hoarse broken voice of Katerina Ivanovna could 4 | be heard from the bridge, and it certainly was a strange spectacle 5 | likely to attract a street crowd. Katerina Ivanovna in her old dress 6 | with the green shawl, wearing a torn straw hat, crushed in a hideous way 7 | on one side, was really frantic. She was exhausted and breathless. Her 8 | wasted consumptive face looked more suffering than ever, and indeed out 9 | of doors in the sunshine a consumptive always looks worse than at home. 10 | But her excitement did not flag, and every moment her irritation grew 11 | more intense. She rushed at the children, shouted at them, coaxed 12 | them, told them before the crowd how to dance and what to sing, began 13 | explaining to them why it was necessary, and driven to desperation by 14 | their not understanding, beat them.... 
Then she would make a rush at the 15 | crowd; if she noticed any decently dressed person stopping to look, she 16 | immediately appealed to him to see what these children “from a genteel, 17 | one may say aristocratic, house” had been brought to. If she heard 18 | laughter or jeering in the crowd, she would rush at once at the scoffers 19 | and begin squabbling with them. Some people laughed, others shook their 20 | heads, but everyone felt curious at the sight of the madwoman with the 21 | frightened children. The frying-pan of which Lebeziatnikov had spoken 22 | was not there, at least Raskolnikov did not see it. But instead of 23 | rapping on the pan, Katerina Ivanovna began clapping her wasted hands, 24 | when she made Lida and Kolya dance and Polenka sing. She too joined in 25 | the singing, but broke down at the second note with a fearful cough, 26 | which made her curse in despair and even shed tears. What made her most 27 | furious was the weeping and terror of Kolya and Lida. Some effort had 28 | been made to dress the children up as street singers are dressed. The 29 | boy had on a turban made of something red and white to look like a Turk. 30 | There had been no costume for Lida; she simply had a red knitted cap, 31 | or rather a night cap that had belonged to Marmeladov, decorated with 32 | a broken piece of white ostrich feather, which had been Katerina 33 | Ivanovna’s grandmother’s and had been preserved as a family possession. 34 | Polenka was in her everyday dress; she looked in timid perplexity at her 35 | mother, and kept at her side, hiding her tears. She dimly realised her 36 | mother’s condition, and looked uneasily about her. She was terribly 37 | frightened of the street and the crowd. Sonia followed Katerina 38 | Ivanovna, weeping and beseeching her to return home, but Katerina 39 | Ivanovna was not to be persuaded. 
40 | -------------------------------------------------------------------------------- /js/sdk/examples/data/zametov.txt: -------------------------------------------------------------------------------- 1 | “How he keeps on! Are you afraid of having let out some secret? Don’t 2 | worry yourself; you said nothing about a countess. But you said a lot 3 | about a bulldog, and about ear-rings and chains, and about Krestovsky 4 | Island, and some porter, and Nikodim Fomitch and Ilya Petrovitch, the 5 | assistant superintendent. And another thing that was of special interest 6 | to you was your own sock. You whined, ‘Give me my sock.’ Zametov 7 | hunted all about your room for your socks, and with his own scented, 8 | ring-bedecked fingers he gave you the rag. And only then were you 9 | comforted, and for the next twenty-four hours you held the wretched 10 | thing in your hand; we could not get it from you. It is most likely 11 | somewhere under your quilt at this moment. And then you asked so 12 | piteously for fringe for your trousers. We tried to find out what sort 13 | of fringe, but we could not make it out. Now to business! Here are 14 | thirty-five roubles; I take ten of them, and shall give you an account 15 | of them in an hour or two. I will let Zossimov know at the same time, 16 | though he ought to have been here long ago, for it is nearly twelve. And 17 | you, Nastasya, look in pretty often while I am away, to see whether he 18 | wants a drink or anything else. And I will tell Pashenka what is wanted 19 | myself. Good-bye!” 20 | -------------------------------------------------------------------------------- /js/sdk/examples/hello_r2r.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const { r2rClient } = require("r2r-js"); 3 | 4 | // Create an account at SciPhi Cloud https://app.sciphi.ai and set an R2R_API_KEY environment variable 5 | // or set the base URL to your instance. E.g. 
const client = new r2rClient();

/**
 * Ingest a sample document, wait for server-side ingestion to complete,
 * then run a RAG query and print the chunk search results and completion.
 */
async function main() {
  const filePath = path.resolve(__dirname, "data/raskolnikov.txt");

  console.log("Ingesting file...");
  const ingestResult = await client.documents.create({
    file: {
      path: filePath,
      name: "raskolnikov.txt",
    },
    metadata: { author: "Dostoevsky" },
  });
  console.log("Ingest result:", JSON.stringify(ingestResult, null, 2));

  // Ingestion is asynchronous on the server; give it time to finish.
  // TODO(review): poll the document's status instead of a fixed sleep.
  console.log("Waiting for the file to be ingested...");
  await new Promise((resolve) => setTimeout(resolve, 10000));

  console.log("Performing RAG...");
  const ragResponse = await client.retrieval.rag({
    query: "To whom was Raskolnikov desperately in debt to?",
  });

  console.log("Search Results:");
  ragResponse.results.searchResults.chunkSearchResults.forEach(
    (result, index) => {
      console.log(`\nResult ${index + 1}:`);
      console.log(`Text: ${result.text.substring(0, 100)}...`);
      console.log(`Score: ${result.score}`);
    },
  );

  console.log("\nCompletion:");
  console.log(ragResponse.results.completion);
}

// BUG FIX: a bare `main();` swallowed rejections as an unhandled promise
// rejection; report the error and set a failing exit code instead.
main().catch((err) => {
  console.error("hello_r2r failed:", err);
  process.exitCode = 1;
});
10 | }, 11 | "scripts": { 12 | "build": "tsc", 13 | "prepublishOnly": "npm run build", 14 | "format": "prettier --write .", 15 | "pretest:integration": "node setup.js", 16 | "test": "jest --no-cache", 17 | "test:watch": "jest --watch", 18 | "test:coverage": "jest --coverage", 19 | "test:collections": "jest CollectionsIntegrationSuperUser CollectionsIntegrationUser", 20 | "test:documents": "jest DocumentsIntegrationSuperUser", 21 | "test:retrieval": "jest RetrievalIntegrationSuperUser", 22 | "test:users": "jest UsersIntegrationSuperUser" 23 | }, 24 | "files": [ 25 | "dist" 26 | ], 27 | "keywords": [], 28 | "author": "", 29 | "license": "ISC", 30 | "dependencies": { 31 | "@jest/globals": "^29.7.0", 32 | "@rrweb/types": "2.0.0-alpha.17", 33 | "axios": "^1.8.4", 34 | "form-data": "^4.0.1", 35 | "rrweb-snapshot": "2.0.0-alpha.4", 36 | "uuid": "^10.0.0" 37 | }, 38 | "devDependencies": { 39 | "@rrweb/record": "2.0.0-alpha.17", 40 | "@types/jest": "^29.5.14", 41 | "@types/node": "^20.17.9", 42 | "@types/uuid": "^10.0.0", 43 | "jest": "^29.7.0", 44 | "prettier": "^3.4.2", 45 | "ts-jest": "^29.2.5", 46 | "ts-node": "^10.9.2", 47 | "typescript": "^5.7.2" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /js/sdk/src/index.ts: -------------------------------------------------------------------------------- 1 | export { r2rClient } from "./r2rClient"; 2 | export * from "./types"; 3 | -------------------------------------------------------------------------------- /js/sdk/src/utils/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./typeTransformer"; 2 | export * from "./utils"; 3 | -------------------------------------------------------------------------------- /js/sdk/src/utils/utils.ts: -------------------------------------------------------------------------------- 1 | export function downloadBlob(blob: Blob, filename: string): void { 2 | const url = 
window.URL.createObjectURL(blob); 3 | const link = document.createElement("a"); 4 | link.href = url; 5 | link.download = filename; 6 | document.body.appendChild(link); 7 | link.click(); 8 | document.body.removeChild(link); 9 | window.URL.revokeObjectURL(url); 10 | } 11 | -------------------------------------------------------------------------------- /js/sdk/src/v3/clients/system.ts: -------------------------------------------------------------------------------- 1 | import { r2rClient } from "../../r2rClient"; 2 | import { 3 | WrappedGenericMessageResponse, 4 | WrappedServerStatsResponse, 5 | WrappedSettingsResponse, 6 | } from "../../types"; 7 | 8 | export class SystemClient { 9 | constructor(private client: r2rClient) {} 10 | 11 | /** 12 | * Check the health of the R2R server. 13 | */ 14 | async health(): Promise { 15 | return await this.client.makeRequest("GET", "health"); 16 | } 17 | 18 | /** 19 | * Get the configuration settings for the R2R server. 20 | * @returns 21 | */ 22 | async settings(): Promise { 23 | return await this.client.makeRequest("GET", "system/settings"); 24 | } 25 | 26 | /** 27 | * Get statistics about the server, including the start time, uptime, 28 | * CPU usage, and memory usage. 
FROM python:3.12-slim AS builder

# Build-time system dependencies (compilers, BLAS, PDF tooling) plus the
# Rust toolchain needed by wheels that have no prebuilt binaries.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
    poppler-utils \
    && apt-get clean && rm -rf /var/lib/apt/lists/* \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Add Rust to PATH
ENV PATH="/root/.cargo/bin:${PATH}"

# Create the /app/py directory
RUN mkdir -p /app/py
WORKDIR /app/py
COPY pyproject.toml ./

# Install the project and its server runtime dependencies.
# BUG FIX: gunicorn/uvicorn were previously installed twice (once here and
# once in a later RUN); a single --no-cache-dir install avoids the
# redundant layer and the extra pip cache in the builder image.
RUN pip install -e ".[core]" && \
    pip install --no-cache-dir gunicorn uvicorn pydantic

# Create the final image
FROM python:3.12-slim

# Minimal runtime deps
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl poppler-utils \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Copy the built environment from builder to final image
# (If you want a fully self-contained environment, copy /usr/local)
COPY --from=builder /usr/local /usr/local

WORKDIR /app

# Copy the rest of your source code
COPY . /app

# Expose environment variables and port
ARG R2R_PORT=8000 R2R_HOST=0.0.0.0
ENV R2R_PORT=$R2R_PORT R2R_HOST=$R2R_HOST
EXPOSE $R2R_PORT

# Launch the app
CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $R2R_HOST --port $R2R_PORT"]
class SearchFileDescriptionsTool(Tool):
    """
    A tool to search over high-level document data (titles, descriptions, etc.)
    """

    def __init__(self):
        super().__init__(
            name="search_file_descriptions",
            description=(
                "Semantic search over AI-generated summaries of stored documents. "
                "This does NOT retrieve chunk-level contents or knowledge-graph relationships. "
                "Use this when you need a broad overview of which documents (files) might be relevant."
            ),
            parameters={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Query string to semantic search over available files 'list documents about XYZ'.",
                    }
                },
                "required": ["query"],
            },
            results_function=self.execute,
            llm_format_function=None,
        )

    async def execute(self, query: str, *args, **kwargs):
        """
        Run a semantic search over document summaries via the context's
        ``file_search_method`` and return an AggregateSearchResult.

        Falls back to an empty result set (rather than raising) when the
        context or search method is missing, or when the search itself fails.
        """
        from core.base.abstractions import AggregateSearchResult

        context = self.context

        # Check if context has necessary method
        if not context or not hasattr(context, "file_search_method"):
            logger.error("No file_search_method provided in context")
            return AggregateSearchResult(document_search_results=[])

        # Get the file_search_method from context
        file_search_method = context.file_search_method

        # Call the file_search_method from the context
        try:
            doc_results = await file_search_method(
                query=query,
                settings=context.search_settings,
            )
        except Exception as e:
            # BUG FIX: log previously said "content_method" (copy-paste from a
            # sibling tool); the call above is to file_search_method.
            logger.error(f"Error calling file_search_method: {e}")
            return AggregateSearchResult(document_search_results=[])

        result = AggregateSearchResult(document_search_results=doc_results)

        # Add to results collector if context has it
        if hasattr(context, "search_results_collector"):
            context.search_results_collector.add_aggregate_result(result)

        return result
8 | """ 9 | 10 | def __init__(self): 11 | super().__init__( 12 | name="web_search", 13 | description=( 14 | "Search for information on the web - use this tool when the user " 15 | "query needs LIVE or recent data from the internet." 16 | ), 17 | parameters={ 18 | "type": "object", 19 | "properties": { 20 | "query": { 21 | "type": "string", 22 | "description": "The query to search with an external web API.", 23 | }, 24 | }, 25 | "required": ["query"], 26 | }, 27 | results_function=self.execute, 28 | llm_format_function=None, 29 | ) 30 | 31 | async def execute(self, query: str, *args, **kwargs): 32 | """ 33 | Implementation of web search functionality. 34 | """ 35 | import asyncio 36 | 37 | from core.base.abstractions import ( 38 | AggregateSearchResult, 39 | WebSearchResult, 40 | ) 41 | from core.utils.serper import SerperClient 42 | 43 | context = self.context 44 | 45 | serper_client = SerperClient() 46 | 47 | raw_results = await asyncio.get_event_loop().run_in_executor( 48 | None, 49 | lambda: serper_client.get_raw(query), 50 | ) 51 | 52 | web_response = await asyncio.get_event_loop().run_in_executor( 53 | None, lambda: WebSearchResult.from_serper_results(raw_results) 54 | ) 55 | 56 | result = AggregateSearchResult( 57 | web_search_results=[web_response], 58 | ) 59 | 60 | # Add to results collector if context is provided 61 | if context and hasattr(context, "search_results_collector"): 62 | context.search_results_collector.add_aggregate_result(result) 63 | 64 | return result 65 | -------------------------------------------------------------------------------- /py/core/base/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_parser import AsyncParser 2 | 3 | __all__ = [ 4 | "AsyncParser", 5 | ] 6 | -------------------------------------------------------------------------------- /py/core/base/parsers/base_parser.py: -------------------------------------------------------------------------------- 1 | 
"""Abstract base class for parsers."""

from abc import ABC, abstractmethod
from typing import AsyncGenerator, Generic, TypeVar

# Type of the raw input a concrete parser accepts (bytes, str, file-like, ...).
T = TypeVar("T")


class AsyncParser(ABC, Generic[T]):
    """Interface for asynchronous parsers that turn raw input into text.

    Subclasses implement ``ingest`` as an async generator so large inputs
    can be consumed incrementally rather than materialized in memory.
    """

    @abstractmethod
    async def ingest(self, data: T, **kwargs) -> AsyncGenerator[str, None]:
        """Parse ``data`` and yield extracted text fragments one at a time."""
        pass
class OrchestrationConfig(ProviderConfig):
    """Settings for the workflow-orchestration layer.

    The concurrency limits bound how many ingestion / graph workflows may
    run at once; ``max_runs`` caps the runs a single worker will accept.
    """

    provider: str
    max_runs: int = 2_048
    graph_search_results_creation_concurrency_limit: int = 32
    ingestion_concurrency_limit: int = 16
    graph_search_results_concurrency_limit: int = 8

    @property
    def supported_providers(self) -> list[str]:
        """Names of the orchestration backends this config accepts."""
        return ["hatchet", "simple"]

    def validate_config(self) -> None:
        """Reject configurations naming an unknown orchestration backend."""
        if self.provider in self.supported_providers:
            return
        raise ValueError(f"Provider {self.provider} is not supported.")
class SchedulerConfig(ProviderConfig):
    """Settings for a background-job scheduler backend."""

    # Name of the scheduler implementation to select.
    provider: str = "apscheduler"

    @property
    def supported_providers(self) -> list[str]:
        # The only scheduler backend currently shipped.
        return ["apscheduler"]

    def validate_config(self):
        """Raise ValueError if ``provider`` names an unknown backend."""
        if self.provider in self.supported_providers:
            return
        raise ValueError(
            f"Scheduler provider {self.provider} is not supported."
        )


class SchedulerProvider(Provider):
    """Abstract interface that concrete job schedulers implement."""

    def __init__(self, config: SchedulerConfig):
        super().__init__(config)
        # Re-bind with the narrower config type for subclass convenience.
        self.config = config

    @abstractmethod
    async def add_job(self, func, trigger, **kwargs):
        """Register *func* to run according to *trigger*."""

    @abstractmethod
    async def start(self):
        """Begin dispatching scheduled jobs."""

    @abstractmethod
    async def shutdown(self):
        """Stop the scheduler and release its resources."""
generate_entity_document_id, 14 | generate_extraction_id, 15 | generate_id, 16 | generate_user_id, 17 | validate_uuid, 18 | yield_sse_event, 19 | ) 20 | 21 | __all__ = [ 22 | "format_search_results_for_llm", 23 | "generate_id", 24 | "generate_default_user_collection_id", 25 | "generate_document_id", 26 | "generate_extraction_id", 27 | "generate_user_id", 28 | "generate_entity_document_id", 29 | "generate_default_prompt_id", 30 | "RecursiveCharacterTextSplitter", 31 | "TextSplitter", 32 | "validate_uuid", 33 | "deep_update", 34 | "_decorate_vector_type", 35 | "_get_vector_column_str", 36 | "yield_sse_event", 37 | "dump_collector", 38 | "dump_obj", 39 | ] 40 | -------------------------------------------------------------------------------- /py/core/configs/full.toml: -------------------------------------------------------------------------------- 1 | [completion] 2 | provider = "r2r" 3 | concurrent_request_limit = 128 4 | 5 | [ingestion] 6 | provider = "unstructured_local" 7 | strategy = "auto" 8 | chunking_strategy = "by_title" 9 | new_after_n_chars = 2_048 10 | max_characters = 4_096 11 | combine_under_n_chars = 1_024 12 | overlap = 1_024 13 | 14 | [ingestion.extra_parsers] 15 | pdf = ["zerox", "ocr"] 16 | 17 | [orchestration] 18 | provider = "hatchet" 19 | kg_creation_concurrency_limit = 32 20 | ingestion_concurrency_limit = 16 21 | kg_concurrency_limit = 8 22 | -------------------------------------------------------------------------------- /py/core/configs/full_azure.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "azure/gpt-4.1-mini" 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = "azure/gpt-4.1" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "azure/gpt-4.1" 10 | 11 | # LLM used for transcription 12 | audio_lm = "azure/whisper-1" 13 | 14 | # Reasoning model, used for `research` agent 15 
| reasoning_llm = "azure/o3-mini" 16 | # Planning model, used for `research` agent 17 | planning_llm = "azure/o3-mini" 18 | 19 | [embedding] 20 | base_model = "azure/text-embedding-3-small" 21 | 22 | [completion_embedding] 23 | base_model = "azure/text-embedding-3-small" 24 | 25 | [ingestion] 26 | provider = "unstructured_local" 27 | strategy = "auto" 28 | chunking_strategy = "by_title" 29 | new_after_n_chars = 2_048 30 | max_characters = 4_096 31 | combine_under_n_chars = 1_024 32 | overlap = 1_024 33 | document_summary_model = "azure/gpt-4.1-mini" 34 | automatic_extraction = true # enable automatic extraction of entities and relations 35 | 36 | [ingestion.extra_parsers] 37 | pdf = ["zerox", "ocr"] 38 | 39 | [ingestion.chunk_enrichment_settings] 40 | generation_config = { model = "azure/gpt-4.1-mini" } 41 | 42 | [orchestration] 43 | provider = "hatchet" 44 | kg_creation_concurrency_limit = 32 45 | ingestion_concurrency_limit = 4 46 | kg_concurrency_limit = 8 47 | -------------------------------------------------------------------------------- /py/core/configs/full_lm_studio.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "lm_studio/llama-3.2-3b-instruct" 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = "lm_studio/llama-3.2-3b-instruct" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "lm_studio/llama3.2-vision" # TODO - Replace with viable candidate 10 | 11 | # LLM used for transcription 12 | audio_lm = "lm_studio/llama-3.2-3b-instruct" # TODO - Replace with viable candidate 13 | 14 | [embedding] 15 | provider = "litellm" 16 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5" 17 | base_dimension = nan 18 | batch_size = 128 19 | concurrent_request_limit = 2 20 | 21 | [completion_embedding] 22 | # Generally this should be the same as the embedding config, but advanced users may want 
to run with a different provider to reduce latency 23 | provider = "litellm" 24 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5" 25 | base_dimension = nan 26 | batch_size = 128 27 | concurrent_request_limit = 2 28 | 29 | [agent] 30 | tools = ["search_file_knowledge"] 31 | 32 | [completion] 33 | provider = "litellm" 34 | concurrent_request_limit = 1 35 | 36 | [completion.generation_config] 37 | temperature = 0.1 38 | top_p = 1 39 | max_tokens_to_sample = 1_024 40 | stream = false 41 | 42 | [ingestion] 43 | provider = "unstructured_local" 44 | strategy = "auto" 45 | chunking_strategy = "by_title" 46 | new_after_n_chars = 512 47 | max_characters = 1_024 48 | combine_under_n_chars = 128 49 | overlap = 20 50 | chunks_for_document_summary = 16 51 | document_summary_model = "lm_studio/llama-3.2-3b-instruct" 52 | automatic_extraction = false 53 | 54 | [orchestration] 55 | provider = "hatchet" 56 | -------------------------------------------------------------------------------- /py/core/configs/full_ollama.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "ollama/llama3.1" 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = "ollama/llama3.1" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "ollama/llama3.1" # TODO - Replace with viable candidate 10 | 11 | # LLM used for transcription 12 | audio_lm = "ollama/llama3.1" # TODO - Replace with viable candidate 13 | 14 | 15 | # Reasoning model, used for `research` agent 16 | reasoning_llm = "ollama/llama3.1" 17 | # Planning model, used for `research` agent 18 | planning_llm = "ollama/llama3.1" 19 | 20 | [embedding] 21 | provider = "ollama" 22 | base_model = "mxbai-embed-large" 23 | base_dimension = 1_024 24 | batch_size = 128 25 | concurrent_request_limit = 2 26 | 27 | [completion_embedding] 28 | provider = "ollama" 29 | base_model = 
"mxbai-embed-large" 30 | base_dimension = 1_024 31 | batch_size = 128 32 | concurrent_request_limit = 2 33 | 34 | [agent] 35 | tools = ["search_file_knowledge"] 36 | 37 | [completion] 38 | provider = "litellm" 39 | concurrent_request_limit = 1 40 | 41 | [completion.generation_config] 42 | temperature = 0.1 43 | top_p = 1 44 | max_tokens_to_sample = 1_024 45 | stream = false 46 | api_base = "http://host.docker.internal:11434" 47 | 48 | [ingestion] 49 | provider = "unstructured_local" 50 | strategy = "auto" 51 | chunking_strategy = "by_title" 52 | new_after_n_chars = 512 53 | max_characters = 1_024 54 | combine_under_n_chars = 128 55 | overlap = 20 56 | chunks_for_document_summary = 16 57 | document_summary_model = "ollama/llama3.1" 58 | automatic_extraction = false 59 | 60 | [orchestration] 61 | provider = "hatchet" 62 | -------------------------------------------------------------------------------- /py/core/configs/gemini.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | fast_llm = "gemini/gemini-2.0-flash-lite" 3 | quality_llm = "gemini/gemini-2.0-flash" 4 | vlm = "gemini/gemini-2.0-flash" 5 | audio_lm = "gemini/gemini-2.0-flash-lite" 6 | 7 | [embedding] 8 | provider = "litellm" 9 | base_model = "gemini/text-embedding-004" 10 | base_dimension = nan 11 | batch_size = 128 12 | concurrent_request_limit = 2 13 | 14 | [completion_embedding] 15 | provider = "litellm" 16 | base_model = "gemini/text-embedding-004" 17 | base_dimension = nan 18 | batch_size = 128 19 | concurrent_request_limit = 2 20 | -------------------------------------------------------------------------------- /py/core/configs/lm_studio.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "lm_studio/llama-3.2-3b-instruct" 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = 
"lm_studio/llama-3.2-3b-instruct" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "lm_studio/llama3.2-vision" # TODO - Replace with viable candidate 10 | 11 | # LLM used for transcription 12 | audio_lm = "lm_studio/llama-3.2-3b-instruct" # TODO - Replace with viable candidate 13 | 14 | [embedding] 15 | provider = "litellm" 16 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5" 17 | base_dimension = nan 18 | batch_size = 128 19 | concurrent_request_limit = 2 20 | 21 | [completion_embedding] 22 | # Generally this should be the same as the embedding config, but advanced users may want to run with a different provider to reduce latency 23 | provider = "litellm" 24 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5" 25 | base_dimension = nan 26 | batch_size = 128 27 | concurrent_request_limit = 2 28 | 29 | [agent] 30 | tools = ["search_file_knowledge"] 31 | 32 | [completion] 33 | provider = "litellm" 34 | concurrent_request_limit = 1 35 | 36 | [completion.generation_config] 37 | temperature = 0.1 38 | top_p = 1 39 | max_tokens_to_sample = 1_024 40 | stream = false 41 | -------------------------------------------------------------------------------- /py/core/configs/ollama.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "ollama/llama3.1" ### NOTE - RECOMMENDED TO USE `openai` with `api_base = "http://localhost:11434/v1"` for best results, otherwise `ollama` with `litellm` is acceptable 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = "ollama/llama3.1" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "ollama/llama3.1" # TODO - Replace with viable candidate 10 | 11 | # LLM used for transcription 12 | audio_lm = "ollama/llama3.1" # TODO - Replace with viable candidate 13 | 14 | 15 | # Reasoning model, used for `research` agent 16 | reasoning_llm = "ollama/llama3.1" 17 | 
# Planning model, used for `research` agent 18 | planning_llm = "ollama/llama3.1" 19 | 20 | [embedding] 21 | provider = "ollama" 22 | base_model = "mxbai-embed-large" 23 | base_dimension = 1_024 24 | batch_size = 128 25 | concurrent_request_limit = 2 26 | 27 | [completion_embedding] 28 | provider = "ollama" 29 | base_model = "mxbai-embed-large" 30 | base_dimension = 1_024 31 | batch_size = 128 32 | concurrent_request_limit = 2 33 | 34 | [agent] 35 | tools = ["search_file_knowledge"] 36 | 37 | [completion] 38 | provider = "litellm" 39 | concurrent_request_limit = 1 40 | 41 | [completion.generation_config] 42 | temperature = 0.1 43 | top_p = 1 44 | max_tokens_to_sample = 1_024 45 | stream = false 46 | api_base = "http://localhost:11434/v1" 47 | -------------------------------------------------------------------------------- /py/core/configs/r2r_azure.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "azure/gpt-4.1-mini" 4 | 5 | # LLM used for user-facing output, like RAG replies 6 | quality_llm = "azure/gpt-4.1" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "azure/gpt-4.1" 10 | 11 | # LLM used for transcription 12 | audio_lm = "azure/whisper-1" 13 | 14 | # Reasoning model, used for `research` agent 15 | reasoning_llm = "azure/o3-mini" 16 | # Planning model, used for `research` agent 17 | planning_llm = "azure/o3-mini" 18 | 19 | [embedding] 20 | base_model = "azure/text-embedding-3-small" 21 | 22 | [completion_embedding] 23 | base_model = "azure/text-embedding-3-small" 24 | -------------------------------------------------------------------------------- /py/core/configs/r2r_azure_with_test_limits.toml: -------------------------------------------------------------------------------- 1 | [app] 2 | # LLM used for internal operations, like deriving conversation names 3 | fast_llm = "azure/gpt-4.1-mini" 4 | 5 | # LLM used for 
user-facing output, like RAG replies 6 | quality_llm = "azure/gpt-4.1" 7 | 8 | # LLM used for ingesting visual inputs 9 | vlm = "azure/gpt-4.1" 10 | 11 | # LLM used for transcription 12 | audio_lm = "azure/whisper-1" 13 | 14 | 15 | # Reasoning model, used for `research` agent 16 | reasoning_llm = "azure/o3-mini" 17 | # Planning model, used for `research` agent 18 | planning_llm = "azure/o3-mini" 19 | 20 | [embedding] 21 | base_model = "openai/text-embedding-3-small" 22 | base_dimension = 512 23 | 24 | [completion_embedding] 25 | base_model = "openai/text-embedding-3-small" 26 | 27 | [database] 28 | [database.limits] 29 | global_per_min = 10 # Small enough to test quickly 30 | monthly_limit = 20 # Small enough to test in one run 31 | 32 | [database.route_limits] 33 | "/v3/retrieval/search" = { route_per_min = 5, monthly_limit = 10 } 34 | 35 | [database.user_limits."47e53676-b478-5b3f-a409-234ca2164de5"] 36 | global_per_min = 2 37 | route_per_min = 1 38 | -------------------------------------------------------------------------------- /py/core/configs/r2r_with_auth.toml: -------------------------------------------------------------------------------- 1 | [auth] 2 | provider = "r2r" 3 | access_token_lifetime_in_minutes = 60 4 | refresh_token_lifetime_in_days = 7 5 | require_authentication = true 6 | require_email_verification = false 7 | default_admin_email = "admin@example.com" 8 | default_admin_password = "change_me_immediately" 9 | -------------------------------------------------------------------------------- /py/core/configs/tavily.toml: -------------------------------------------------------------------------------- 1 | [completion] 2 | provider = "r2r" 3 | concurrent_request_limit = 128 4 | 5 | [ingestion] 6 | provider = "unstructured_local" 7 | strategy = "auto" 8 | chunking_strategy = "by_title" 9 | new_after_n_chars = 2_048 10 | max_characters = 4_096 11 | combine_under_n_chars = 1_024 12 | overlap = 1_024 13 | [ingestion.extra_parsers] 14 | pdf = "zerox" 
15 | 16 | [orchestration] 17 | provider = "hatchet" 18 | kg_creation_concurrency_limit = 32 19 | ingestion_concurrency_limit = 16 20 | kg_concurrency_limit = 8 21 | 22 | [agent] 23 | # Enable the Tavily search and extraction tools 24 | rag_tools = [ 25 | "search_file_descriptions", 26 | "search_file_knowledge", 27 | "get_file_content", 28 | "tavily_search", 29 | "tavily_extract" 30 | ] 31 | -------------------------------------------------------------------------------- /py/core/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/__init__.py -------------------------------------------------------------------------------- /py/core/examples/data/DeepSeek_R1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/DeepSeek_R1.pdf -------------------------------------------------------------------------------- /py/core/examples/data/aristotle_v2.txt: -------------------------------------------------------------------------------- 1 | Aristotle[A] (Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science. 2 | 3 | Little is known about Aristotle's life. He was born in the city of Stagira in northern Greece during the Classical period. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. 
At 17 or 18, he joined Plato's Academy in Athens and remained there until the age of 37 (c. 347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored his son Alexander the Great beginning in 343 BC. He established a library in the Lyceum, which helped him to produce many of his hundreds of books on papyrus scrolls. 4 | 5 | Though Aristotle wrote many elegant treatises and dialogues for publication, only around a third of his original output has survived, none of it intended for publication. Aristotle provided a complex synthesis of the various philosophies existing prior to him. His teachings and methods of inquiry have had a significant impact across the world, and remain a subject of contemporary philosophical discussion. 6 | 7 | Aristotle's views profoundly shaped medieval scholarship. The influence of his physical science extended from late antiquity and the Early Middle Ages into the Renaissance, and was not replaced systematically until the Enlightenment and theories such as classical mechanics were developed. He influenced Judeo-Islamic philosophies during the Middle Ages, as well as Christian theology, especially the Neoplatonism of the Early Church and the scholastic tradition of the Catholic Church. 8 | 9 | Aristotle was revered among medieval Muslim scholars as "The First Teacher", and among medieval Christians like Thomas Aquinas as simply "The Philosopher", while the poet Dante called him "the master of those who know". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and Jean Buridan. Aristotle's influence on logic continued well into the 19th century. In addition, his ethics, although always influential, gained renewed interest with the modern advent of virtue ethics. 
10 | -------------------------------------------------------------------------------- /py/core/examples/data/graphrag.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/graphrag.pdf -------------------------------------------------------------------------------- /py/core/examples/data/lyft_2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/lyft_2021.pdf -------------------------------------------------------------------------------- /py/core/examples/data/pg_essay_1.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_1.html -------------------------------------------------------------------------------- /py/core/examples/data/pg_essay_2.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_2.html -------------------------------------------------------------------------------- /py/core/examples/data/pg_essay_3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_3.html -------------------------------------------------------------------------------- /py/core/examples/data/pg_essay_4.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_4.html 
from r2r import R2RClient

client = R2RClient()

# Create a tiny document on disk so there is something to ingest.
with open("test.txt", "w") as fh:
    fh.write("John is a person that works at Google.")

# Push the file through the ingestion pipeline.
client.ingest_files(file_paths=["test.txt"])

# Ask a question answered by the ingested text; RAG is invoked
# directly on the client object.
response = client.rag(
    query="Who is john",
    rag_generation_config={"model": "gpt-4.1-mini", "temperature": 0.0},
)
results = response["results"]
print(f"Search Results:\n{results['search_results']}")
print(f"Completion:\n{results['completion']}")

# Example output (abridged):
# Search Results:
# AggregateSearchResult(chunk_search_results=[ChunkSearchResult(...,
#   metadata={'text': 'John is a person that works at Google.', ...})],
#   graph_search_results=None)
# Completion:
# ChatCompletion(..., message=ChatCompletionMessage(
#   content='John is a person that works at Google [1].', ...))
Cable,4,19.99,79.96 6 | 2024-01-17,C1004,Monitor 27",1,349.99,349.99 7 | 2024-01-17,C1005,Keyboard Elite,2,129.99,259.98 8 | 2024-01-18,C1002,Headphones Pro,1,199.99,199.99 9 | 2024-01-18,C1006,Webcam HD,3,79.99,239.97 10 | 2024-01-19,C1007,Power Bank,2,49.99,99.98 11 | 2024-01-19,C1003,Phone Case,5,24.99,124.95 12 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/doc.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/doc.doc -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/docx.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/docx.docx -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/eml.eml: -------------------------------------------------------------------------------- 1 | From: sender@example.com 2 | To: recipient@example.com 3 | Subject: Meeting Summary - Q4 Planning 4 | Date: Mon, 16 Dec 2024 10:30:00 -0500 5 | Content-Type: multipart/mixed; boundary="boundary123" 6 | 7 | --boundary123 8 | Content-Type: text/plain; charset="utf-8" 9 | Content-Transfer-Encoding: quoted-printable 10 | 11 | Hi Team, 12 | 13 | Here's a summary of our Q4 planning meeting: 14 | 15 | Key Points: 16 | 1. Revenue targets increased by 15% 17 | 2. New product launch scheduled for November 18 | 3. 
Marketing budget approved for expansion 19 | 20 | Action Items: 21 | - Sarah: Prepare detailed product roadmap 22 | - Mike: Contact vendors for pricing 23 | - Jennifer: Update financial projections 24 | 25 | Please review and let me know if you have any questions. 26 | 27 | Best regards, 28 | Alex 29 | 30 | --boundary123 31 | Content-Type: text/html; charset="utf-8" 32 | Content-Transfer-Encoding: quoted-printable 33 | 34 | 35 | 36 |

Hi Team,

37 | 38 |

Here's a summary of our Q4 planning meeting:

39 | 40 |

Key Points:

41 |
    42 |
  • Revenue targets increased by 15%
  • 43 |
  • New product launch scheduled for November
  • 44 |
  • Marketing budget approved for expansion
  • 45 |
46 | 47 |

Action Items:

48 |
    49 |
  • Sarah: Prepare detailed product roadmap
  • 50 |
  • Mike: Contact vendors for pricing
  • 51 |
  • Jennifer: Update financial projections
  • 52 |
53 | 54 |

Please review and let me know if you have any questions.

55 | 56 |

Best regards,
57 | Alex

58 | 59 | 60 | 61 | --boundary123-- 62 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/epub.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/epub.epub -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/heic.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/heic.heic -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Product Dashboard 7 | 38 | 39 | 40 |
41 |
42 |

Product Performance Dashboard

43 |

Real-time metrics and analytics

44 |
45 |
46 |
47 |

Active Users

48 |

1,234

49 |
50 |
51 |

Revenue

52 |

$45,678

53 |
54 |
55 |

Conversion Rate

56 |

2.34%

57 |
58 |
59 |
60 |

Recent Activity

61 |
    62 |
  • New feature deployed: Enhanced search
  • 63 |
  • Bug fix: Mobile navigation issue
  • 64 |
  • Performance improvement: Cache optimization
  • 65 |
66 |
67 |
68 | 69 | 70 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/jpeg.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/jpeg.jpeg -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/jpg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/jpg.jpg -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/js.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const { r2rClient } = require("r2r-js"); 3 | 4 | // Create an account at SciPhi Cloud https://app.sciphi.ai and set an R2R_API_KEY environment variable 5 | // or set the base URL to your instance. E.g. 
r2rClient("http://localhost:7272") 6 | const client = new r2rClient(); 7 | 8 | async function main() { 9 | const filePath = path.resolve(__dirname, "data/raskolnikov.txt"); 10 | 11 | 12 | console.log("Ingesting file..."); 13 | const ingestResult = await client.documents.create({ 14 | file: { 15 | path: filePath, 16 | name: "raskolnikov.txt" 17 | }, 18 | metadata: { author: "Dostoevsky" }, 19 | }); 20 | console.log("Ingest result:", JSON.stringify(ingestResult, null, 2)); 21 | 22 | console.log("Waiting for the file to be ingested..."); 23 | await new Promise((resolve) => setTimeout(resolve, 10000)); 24 | 25 | console.log("Performing RAG..."); 26 | const ragResponse = await client.retrieval.rag({ 27 | query: "To whom was Raskolnikov desperately in debt to?", 28 | }); 29 | 30 | console.log("Search Results:"); 31 | ragResponse.results.searchResults.chunkSearchResults.forEach( 32 | (result, index) => { 33 | console.log(`\nResult ${index + 1}:`); 34 | console.log(`Text: ${result.text.substring(0, 100)}...`); 35 | console.log(`Score: ${result.score}`); 36 | }, 37 | ); 38 | 39 | console.log("\nCompletion:"); 40 | console.log(ragResponse.results.completion); 41 | } 42 | 43 | main(); 44 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/json.json: -------------------------------------------------------------------------------- 1 | { 2 | "dashboard": { 3 | "name": "Product Performance Dashboard", 4 | "lastUpdated": "2024-12-16T10:30:00Z", 5 | "metrics": { 6 | "activeUsers": { 7 | "current": 1234, 8 | "previousPeriod": 1156, 9 | "percentChange": 6.75 10 | }, 11 | "revenue": { 12 | "current": 45678.90, 13 | "previousPeriod": 41234.56, 14 | "percentChange": 10.78, 15 | "currency": "USD" 16 | }, 17 | "conversionRate": { 18 | "current": 2.34, 19 | "previousPeriod": 2.12, 20 | "percentChange": 10.38, 21 | "unit": "percent" 22 | } 23 | }, 24 | "recentActivity": [ 25 | { 26 | "type": "deployment", 27 | "title": 
"Enhanced search", 28 | "description": "New feature deployed: Enhanced search functionality", 29 | "timestamp": "2024-12-15T15:45:00Z", 30 | "status": "successful" 31 | }, 32 | { 33 | "type": "bugfix", 34 | "title": "Mobile navigation", 35 | "description": "Bug fix: Mobile navigation issue resolved", 36 | "timestamp": "2024-12-14T09:20:00Z", 37 | "status": "successful" 38 | }, 39 | { 40 | "type": "performance", 41 | "title": "Cache optimization", 42 | "description": "Performance improvement: Cache optimization completed", 43 | "timestamp": "2024-12-13T11:15:00Z", 44 | "status": "successful" 45 | } 46 | ], 47 | "settings": { 48 | "refreshInterval": 300, 49 | "timezone": "UTC", 50 | "theme": "light", 51 | "notifications": { 52 | "email": true, 53 | "slack": true, 54 | "inApp": true 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/msg.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/msg.msg -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/odt.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/odt.odt -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/pdf.pdf -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/png.png: 
# type: ignore
from typing import AsyncGenerator

from bs4 import BeautifulSoup

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class HTMLParser(AsyncParser[str | bytes]):
    """Parse HTML documents into plain text.

    Strips all markup with BeautifulSoup and yields the concatenated text
    content as a single chunk.
    """

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        # Providers are stored for interface parity with the other parsers;
        # HTML text extraction itself uses neither of them.
        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Yield the text content of ``data`` with all HTML tags removed."""
        yield BeautifulSoup(data, "html.parser").get_text()
-------------------------------------------------------------------------------- /py/core/examples/supported_file_types/rst.rst: -------------------------------------------------------------------------------- 1 | Header 1 2 | ======== 3 | -------- 4 | Subtitle 5 | -------- 6 | 7 | Example text. 8 | 9 | .. contents:: Table of Contents 10 | 11 | Header 2 12 | -------- 13 | 14 | 1. Blah blah ``code`` blah 15 | 16 | 2. More ``code``, hooray 17 | 18 | 3. Somé UTF-8° 19 | 20 | The UTF-8 quote character in this table used to cause python to go boom. Now docutils just silently ignores it. 21 | 22 | .. csv-table:: Things that are Awesome (on a scale of 1-11) 23 | :quote: ” 24 | 25 | Thing,Awesomeness 26 | Icecream, 7 27 | Honey Badgers, 10.5 28 | Nickelback, -2 29 | Iron Man, 10 30 | Iron Man 2, 3 31 | Tabular Data, 5 32 | Made up ratings, 11 33 | 34 | .. code:: 35 | 36 | A block of code 37 | 38 | .. code:: python 39 | 40 | python.code('hooray') 41 | 42 | .. code:: javascript 43 | 44 | export function ƒ(ɑ, β) {} 45 | 46 | .. doctest:: ignored 47 | 48 | >>> some_function() 49 | 'result' 50 | 51 | >>> some_function() 52 | 'result' 53 | 54 | ============== ========================================================== 55 | Travis http://travis-ci.org/tony/pullv 56 | Docs http://pullv.rtfd.org 57 | API http://pullv.readthedocs.org/en/latest/api.html 58 | Issues https://github.com/tony/pullv/issues 59 | Source https://github.com/tony/pullv 60 | ============== ========================================================== 61 | 62 | 63 | .. image:: https://scan.coverity.com/projects/621/badge.svg 64 | :target: https://scan.coverity.com/projects/621 65 | :alt: Coverity Scan Build Status 66 | 67 | .. image:: https://scan.coverity.com/projects/621/badge.svg 68 | :alt: Coverity Scan Build Status 69 | 70 | Field list 71 | ---------- 72 | 73 | :123456789 123456789 123456789 123456789 123456789 1: Uh-oh! This name is too long! 
74 | :123456789 123456789 123456789 123456789 1234567890: this is a long name, 75 | but no problem! 76 | :123456789 12345: this is not so long, but long enough for the default! 77 | :123456789 1234: this should work even with the default :) 78 | 79 | someone@somewhere.org 80 | 81 | Press :kbd:`Ctrl+C` to quit 82 | 83 | 84 | .. raw:: html 85 | 86 |

RAW HTML!

87 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/rtf.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\deff0 2 | {\fonttbl{\f0\froman\fcharset0 Times New Roman;}} 3 | \viewkind4\uc1\pard\f0\fs24 4 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\par 5 | } 6 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/tiff.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/tiff.tiff -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/tsv.tsv: -------------------------------------------------------------------------------- 1 | Region Year Quarter Sales Employees Growth Rate 2 | North America 2024 Q1 1250000 45 5.2 3 | Europe 2024 Q1 980000 38 4.8 4 | Asia Pacific 2024 Q1 1450000 52 6.1 5 | South America 2024 Q1 580000 25 3.9 6 | Africa 2024 Q1 320000 18 4.2 7 | North America 2024 Q2 1380000 47 5.5 8 | Europe 2024 Q2 1050000 40 4.9 9 | Asia Pacific 2024 Q2 1520000 54 5.8 10 | South America 2024 Q2 620000 27 4.1 11 | Africa 2024 Q2 350000 20 4.4 12 | -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/xls.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/xls.xls -------------------------------------------------------------------------------- /py/core/examples/supported_file_types/xlsx.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/xlsx.xlsx -------------------------------------------------------------------------------- /py/core/main/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstractions import R2RProviders 2 | from .api import * 3 | from .app import * 4 | 5 | # from .app_entry import r2r_app 6 | from .assembly import * 7 | from .orchestration import * 8 | from .services import * 9 | 10 | __all__ = [ 11 | # R2R Primary 12 | "R2RProviders", 13 | "R2RApp", 14 | "R2RBuilder", 15 | "R2RConfig", 16 | # Factory 17 | "R2RProviderFactory", 18 | ## R2R SERVICES 19 | "AuthService", 20 | "IngestionService", 21 | "MaintenanceService", 22 | "ManagementService", 23 | "RetrievalService", 24 | "GraphService", 25 | ] 26 | -------------------------------------------------------------------------------- /py/core/main/assembly/__init__.py: -------------------------------------------------------------------------------- 1 | from ..config import R2RConfig 2 | from .builder import R2RBuilder 3 | from .factory import R2RProviderFactory 4 | 5 | __all__ = [ 6 | # Builder 7 | "R2RBuilder", 8 | # Config 9 | "R2RConfig", 10 | # Factory 11 | "R2RProviderFactory", 12 | ] 13 | -------------------------------------------------------------------------------- /py/core/main/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | from .project_schema import ProjectSchemaMiddleware 2 | 3 | __all__ = [ 4 | "ProjectSchemaMiddleware", 5 | ] 6 | 
import logging
import re

from fastapi import Request
from fastapi.responses import JSONResponse
from starlette.middleware.base import BaseHTTPMiddleware

from core.utils.context import project_schema_context, set_project_schema

logger = logging.getLogger(__name__)

# Requests for these path prefixes never touch the database, so they are
# exempt from schema resolution.
_SCHEMA_EXEMPT_PREFIXES = ("/docs", "/redoc", "/static", "/openapi.json")

# Only alphanumerics and underscores are accepted, which rules out SQL
# injection through the header value.
_SCHEMA_NAME_PATTERN = re.compile(r"^[a-zA-Z0-9_]+$")


class ProjectSchemaMiddleware(BaseHTTPMiddleware):
    """Bind each request to a project schema taken from ``x-project-name``.

    The resolved schema name is published to a context variable for the
    duration of the request and reset afterwards, so downstream code can
    read it without threading it through every call.
    """

    def __init__(
        self, app, default_schema: str = "r2r_default", schema_exists_func=None
    ):
        super().__init__(app)
        # Schema used when the request carries no x-project-name header.
        self.default_schema = default_schema
        # Optional async callable used to verify a non-default schema exists.
        self.schema_exists_func = schema_exists_func

    async def dispatch(self, request: Request, call_next):
        # Static assets and API docs bypass schema handling entirely.
        if request.url.path.startswith(_SCHEMA_EXEMPT_PREFIXES):
            return await call_next(request)

        schema_name = request.headers.get(
            "x-project-name", self.default_schema
        )

        if not _SCHEMA_NAME_PATTERN.match(schema_name):
            return JSONResponse(
                status_code=400,
                content={"detail": "Invalid schema name format"},
            )

        # Optionally confirm that a non-default schema actually exists.
        if self.schema_exists_func and schema_name != self.default_schema:
            try:
                if not await self.schema_exists_func(schema_name):
                    return JSONResponse(
                        status_code=403,
                        content={
                            "detail": f"Schema '{schema_name}' does not exist"
                        },
                    )
            except Exception as e:
                logger.error(f"Error checking schema existence: {e}")
                return JSONResponse(
                    status_code=500,
                    content={
                        "detail": "Internal server error checking schema"
                    },
                )

        # Defensive strip of double quotes (the regex above already forbids
        # them) before publishing the name to the context variable.
        schema_name = schema_name.replace('"', "")

        token = set_project_schema(schema_name)
        try:
            return await call_next(request)
        finally:
            # Always restore the previous context value, even on error.
            project_schema_context.reset(token)
from abc import ABC

from ..abstractions import R2RProviders
from ..config import R2RConfig


class Service(ABC):
    """Abstract base for all R2R services.

    Holds the shared application configuration and the provider bundle
    that every concrete service (auth, ingestion, retrieval, ...) is
    constructed with.
    """

    def __init__(
        self,
        config: R2RConfig,
        providers: R2RProviders,
    ):
        # Application-level configuration object.
        self.config = config
        # Bundle of instantiated providers this service may call into.
        self.providers = providers
"TextParser", 35 | "PythonParser", 36 | "CSSParser", 37 | "JSParser", 38 | "TSParser", 39 | ] 40 | -------------------------------------------------------------------------------- /py/core/parsers/media/__init__.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | from .audio_parser import AudioParser 3 | from .bmp_parser import BMPParser 4 | from .doc_parser import DOCParser 5 | from .docx_parser import DOCXParser 6 | from .img_parser import ImageParser 7 | from .odt_parser import ODTParser 8 | from .pdf_parser import ( 9 | BasicPDFParser, 10 | OCRPDFParser, 11 | PDFParserUnstructured, 12 | VLMPDFParser, 13 | ) 14 | from .ppt_parser import PPTParser 15 | from .pptx_parser import PPTXParser 16 | from .rtf_parser import RTFParser 17 | 18 | __all__ = [ 19 | "AudioParser", 20 | "BMPParser", 21 | "DOCParser", 22 | "DOCXParser", 23 | "ImageParser", 24 | "ODTParser", 25 | "OCRPDFParser", 26 | "VLMPDFParser", 27 | "BasicPDFParser", 28 | "PDFParserUnstructured", 29 | "PPTParser", 30 | "PPTXParser", 31 | "RTFParser", 32 | ] 33 | -------------------------------------------------------------------------------- /py/core/parsers/media/audio_parser.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import logging 3 | import os 4 | import tempfile 5 | from typing import AsyncGenerator 6 | 7 | from litellm import atranscription 8 | 9 | from core.base.parsers.base_parser import AsyncParser 10 | from core.base.providers import ( 11 | CompletionProvider, 12 | DatabaseProvider, 13 | IngestionConfig, 14 | ) 15 | 16 | logger = logging.getLogger() 17 | 18 | 19 | class AudioParser(AsyncParser[bytes]): 20 | """A parser for audio data using Whisper transcription.""" 21 | 22 | def __init__( 23 | self, 24 | config: IngestionConfig, 25 | database_provider: DatabaseProvider, 26 | llm_provider: CompletionProvider, 27 | ): 28 | self.database_provider = database_provider 29 | 
self.llm_provider = llm_provider 30 | self.config = config 31 | self.atranscription = atranscription 32 | 33 | async def ingest( # type: ignore 34 | self, data: bytes, **kwargs 35 | ) -> AsyncGenerator[str, None]: 36 | """Ingest audio data and yield a transcription using Whisper via 37 | LiteLLM. 38 | 39 | Args: 40 | data: Raw audio bytes 41 | *args, **kwargs: Additional arguments passed to the transcription call 42 | 43 | Yields: 44 | Chunks of transcribed text 45 | """ 46 | try: 47 | # Create a temporary file to store the audio data 48 | with tempfile.NamedTemporaryFile( 49 | suffix=".wav", delete=False 50 | ) as temp_file: 51 | temp_file.write(data) 52 | temp_file_path = temp_file.name 53 | 54 | # Call Whisper transcription 55 | response = await self.atranscription( 56 | model=self.config.audio_transcription_model 57 | or self.config.app.audio_lm, 58 | file=open(temp_file_path, "rb"), 59 | **kwargs, 60 | ) 61 | 62 | # The response should contain the transcribed text directly 63 | yield response.text 64 | 65 | except Exception as e: 66 | logger.error(f"Error processing audio with Whisper: {str(e)}") 67 | raise 68 | 69 | finally: 70 | # Clean up the temporary file 71 | try: 72 | os.unlink(temp_file_path) 73 | except Exception as e: 74 | logger.warning(f"Failed to delete temporary file: {str(e)}") 75 | -------------------------------------------------------------------------------- /py/core/parsers/media/bmp_parser.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | from typing import AsyncGenerator 3 | 4 | from core.base.parsers.base_parser import AsyncParser 5 | from core.base.providers import ( 6 | CompletionProvider, 7 | DatabaseProvider, 8 | IngestionConfig, 9 | ) 10 | 11 | 12 | class BMPParser(AsyncParser[str | bytes]): 13 | """A parser for BMP image data.""" 14 | 15 | def __init__( 16 | self, 17 | config: IngestionConfig, 18 | database_provider: DatabaseProvider, 19 | llm_provider: CompletionProvider, 
# type: ignore
from io import BytesIO
from typing import AsyncGenerator

from docx import Document

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class DOCXParser(AsyncParser[str | bytes]):
    """Parser for Microsoft Word (.docx) documents."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.Document = Document

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:  # type: ignore
        """Yield the text of every paragraph in the document, in order."""
        if isinstance(data, str):
            raise ValueError("DOCX data must be in bytes format.")

        document = self.Document(BytesIO(data))
        for paragraph in document.paragraphs:
            yield paragraph.text
# type: ignore
from io import BytesIO
from typing import AsyncGenerator

from pptx import Presentation

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class PPTXParser(AsyncParser[str | bytes]):
    """A parser for PPT data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.Presentation = Presentation

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:  # type: ignore
        """Walk every slide and yield the text of each shape that has any."""
        if isinstance(data, str):
            raise ValueError("PPT data must be in bytes format.")

        presentation = self.Presentation(BytesIO(data))
        for slide in presentation.slides:
            # Not every shape carries text (pictures, charts, ...).
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    yield shape.text
striprtf.striprtf import rtf_to_text 5 | 6 | from core.base.parsers.base_parser import AsyncParser 7 | from core.base.providers import ( 8 | CompletionProvider, 9 | DatabaseProvider, 10 | IngestionConfig, 11 | ) 12 | 13 | 14 | class RTFParser(AsyncParser[str | bytes]): 15 | """Parser for Rich Text Format (.rtf) files.""" 16 | 17 | def __init__( 18 | self, 19 | config: IngestionConfig, 20 | database_provider: DatabaseProvider, 21 | llm_provider: CompletionProvider, 22 | ): 23 | self.database_provider = database_provider 24 | self.llm_provider = llm_provider 25 | self.config = config 26 | self.striprtf = rtf_to_text 27 | 28 | async def ingest( 29 | self, data: str | bytes, **kwargs 30 | ) -> AsyncGenerator[str, None]: 31 | if isinstance(data, bytes): 32 | data = data.decode("utf-8", errors="ignore") 33 | 34 | try: 35 | # Convert RTF to plain text 36 | plain_text = self.striprtf(data) 37 | 38 | # Split into paragraphs and yield non-empty ones 39 | paragraphs = plain_text.split("\n\n") 40 | for paragraph in paragraphs: 41 | if paragraph.strip(): 42 | yield paragraph.strip() 43 | 44 | except Exception as e: 45 | raise ValueError(f"Error processing RTF file: {str(e)}") from e 46 | -------------------------------------------------------------------------------- /py/core/parsers/structured/__init__.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | from .csv_parser import CSVParser, CSVParserAdvanced 3 | from .eml_parser import EMLParser 4 | from .epub_parser import EPUBParser 5 | from .json_parser import JSONParser 6 | from .msg_parser import MSGParser 7 | from .org_parser import ORGParser 8 | from .p7s_parser import P7SParser 9 | from .rst_parser import RSTParser 10 | from .tsv_parser import TSVParser 11 | from .xls_parser import XLSParser 12 | from .xlsx_parser import XLSXParser, XLSXParserAdvanced 13 | 14 | __all__ = [ 15 | "CSVParser", 16 | "CSVParserAdvanced", 17 | "EMLParser", 18 | "EPUBParser", 19 | 
"JSONParser", 20 | "MSGParser", 21 | "ORGParser", 22 | "P7SParser", 23 | "RSTParser", 24 | "TSVParser", 25 | "XLSParser", 26 | "XLSXParser", 27 | "XLSXParserAdvanced", 28 | ] 29 | -------------------------------------------------------------------------------- /py/core/parsers/structured/eml_parser.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | from email import message_from_bytes, policy 3 | from typing import AsyncGenerator 4 | 5 | from core.base.parsers.base_parser import AsyncParser 6 | from core.base.providers import ( 7 | CompletionProvider, 8 | DatabaseProvider, 9 | IngestionConfig, 10 | ) 11 | 12 | 13 | class EMLParser(AsyncParser[str | bytes]): 14 | """Parser for EML (email) files.""" 15 | 16 | def __init__( 17 | self, 18 | config: IngestionConfig, 19 | database_provider: DatabaseProvider, 20 | llm_provider: CompletionProvider, 21 | ): 22 | self.database_provider = database_provider 23 | self.llm_provider = llm_provider 24 | self.config = config 25 | 26 | async def ingest( 27 | self, data: str | bytes, **kwargs 28 | ) -> AsyncGenerator[str, None]: 29 | """Ingest EML data and yield email content.""" 30 | if isinstance(data, str): 31 | raise ValueError("EML data must be in bytes format.") 32 | 33 | # Parse email with policy for modern email handling 34 | email_message = message_from_bytes(data, policy=policy.default) 35 | 36 | # Extract and yield email metadata 37 | metadata = [] 38 | if email_message["Subject"]: 39 | metadata.append(f"Subject: {email_message['Subject']}") 40 | if email_message["From"]: 41 | metadata.append(f"From: {email_message['From']}") 42 | if email_message["To"]: 43 | metadata.append(f"To: {email_message['To']}") 44 | if email_message["Date"]: 45 | metadata.append(f"Date: {email_message['Date']}") 46 | 47 | if metadata: 48 | yield "\n".join(metadata) 49 | 50 | # Extract and yield email body 51 | if email_message.is_multipart(): 52 | for part in email_message.walk(): 53 | if 
part.get_content_type() == "text/plain": 54 | text = part.get_content() 55 | if text.strip(): 56 | yield text.strip() 57 | elif part.get_content_type() == "text/html": 58 | # Could add HTML parsing here if needed 59 | continue 60 | else: 61 | body = email_message.get_content() 62 | if body.strip(): 63 | yield body.strip() 64 | -------------------------------------------------------------------------------- /py/core/parsers/structured/msg_parser.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | import tempfile 4 | from typing import AsyncGenerator 5 | 6 | from msg_parser import MsOxMessage 7 | 8 | from core.base.parsers.base_parser import AsyncParser 9 | from core.base.providers import ( 10 | CompletionProvider, 11 | DatabaseProvider, 12 | IngestionConfig, 13 | ) 14 | 15 | 16 | class MSGParser(AsyncParser[str | bytes]): 17 | """Parser for MSG (Outlook Message) files using msg_parser.""" 18 | 19 | def __init__( 20 | self, 21 | config: IngestionConfig, 22 | database_provider: DatabaseProvider, 23 | llm_provider: CompletionProvider, 24 | ): 25 | self.database_provider = database_provider 26 | self.llm_provider = llm_provider 27 | self.config = config 28 | 29 | async def ingest( 30 | self, data: str | bytes, **kwargs 31 | ) -> AsyncGenerator[str, None]: 32 | """Ingest MSG data and yield email content.""" 33 | if isinstance(data, str): 34 | raise ValueError("MSG data must be in bytes format.") 35 | 36 | tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".msg") 37 | try: 38 | tmp_file.write(data) 39 | tmp_file.close() 40 | 41 | msg = MsOxMessage(tmp_file.name) 42 | 43 | metadata = [] 44 | 45 | if msg.subject: 46 | metadata.append(f"Subject: {msg.subject}") 47 | if msg.sender: 48 | metadata.append(f"From: {msg.sender}") 49 | if msg.to: 50 | metadata.append(f"To: {', '.join(msg.to)}") 51 | if msg.sent_date: 52 | metadata.append(f"Date: {msg.sent_date}") 53 | if metadata: 54 | yield 
class ORGParser(AsyncParser[str | bytes]):
    """Parser for ORG (Emacs Org-mode) files."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config
        # Stored as an attribute (rather than used directly) so tests can
        # swap in a stub parser.
        self.orgparse = orgparse

    def _process_node(self, node) -> list[str]:
        """Return the renderable pieces of one org node: heading, then body."""
        contents = []

        # Re-create the heading line with its original number of asterisks.
        if node.level > 0:
            contents.append(f"{'*' * node.level} {node.heading}")

        # Add body content if it exists.
        if node.body:
            contents.append(node.body.strip())

        return contents

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest ORG data and yield document content.

        Yields the pre-heading root body first, then each node's heading and
        body as separate chunks.

        Raises:
            ValueError: if the org file cannot be parsed or processed.
        """
        from io import StringIO

        if isinstance(data, bytes):
            data = data.decode("utf-8")

        # Create the buffer *before* the try block: the previous version
        # created it inside `try`, so a failure there raised a NameError in
        # `finally` (file_obj unbound) that masked the original error.
        file_obj = StringIO(data)
        try:
            root = self.orgparse.load(file_obj)

            # Root-level text that precedes the first heading.
            if root.body:
                yield root.body.strip()

            # root[0] is the root node itself; iterate the real nodes only.
            for node in root[1:]:
                for content in self._process_node(node):
                    if content.strip():
                        yield content.strip()

        except Exception as e:
            raise ValueError(f"Error processing ORG file: {str(e)}") from e
        finally:
            file_obj.close()
class HTMLParser(AsyncParser[str | bytes]):
    """A parser for HTML data."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest HTML data and yield the document's visible text."""
        # "html.parser" is the stdlib backend; no external parser needed.
        document = BeautifulSoup(data, "html.parser")
        yield document.get_text()
self.llm_provider = llm_provider 23 | self.config = config 24 | 25 | async def ingest( 26 | self, data: str | bytes, *args, **kwargs 27 | ) -> AsyncGenerator[str | bytes, None]: 28 | if isinstance(data, bytes): 29 | data = data.decode("utf-8") 30 | yield data 31 | -------------------------------------------------------------------------------- /py/core/providers/__init__.py: -------------------------------------------------------------------------------- 1 | from .auth import ( 2 | ClerkAuthProvider, 3 | JwtAuthProvider, 4 | R2RAuthProvider, 5 | SupabaseAuthProvider, 6 | ) 7 | from .crypto import ( 8 | BcryptCryptoConfig, 9 | BCryptCryptoProvider, 10 | NaClCryptoConfig, 11 | NaClCryptoProvider, 12 | ) 13 | from .database import PostgresDatabaseProvider 14 | from .email import ( 15 | AsyncSMTPEmailProvider, 16 | ConsoleMockEmailProvider, 17 | MailerSendEmailProvider, 18 | SendGridEmailProvider, 19 | ) 20 | from .embeddings import ( 21 | LiteLLMEmbeddingProvider, 22 | OllamaEmbeddingProvider, 23 | OpenAIEmbeddingProvider, 24 | ) 25 | from .file import ( 26 | PostgresFileProvider, 27 | S3FileProvider, 28 | ) 29 | from .ingestion import ( # type: ignore 30 | R2RIngestionConfig, 31 | R2RIngestionProvider, 32 | UnstructuredIngestionConfig, 33 | UnstructuredIngestionProvider, 34 | ) 35 | from .llm import ( 36 | AnthropicCompletionProvider, 37 | LiteLLMCompletionProvider, 38 | OpenAICompletionProvider, 39 | R2RCompletionProvider, 40 | ) 41 | from .ocr import ( 42 | MistralOCRProvider, 43 | ) 44 | from .orchestration import ( 45 | HatchetOrchestrationProvider, 46 | SimpleOrchestrationProvider, 47 | ) 48 | from .scheduler import ( 49 | APSchedulerProvider, 50 | ) 51 | 52 | __all__ = [ 53 | # Auth 54 | "R2RAuthProvider", 55 | "SupabaseAuthProvider", 56 | "JwtAuthProvider", 57 | "ClerkAuthProvider", 58 | # Ingestion 59 | "R2RIngestionProvider", 60 | "R2RIngestionConfig", 61 | "UnstructuredIngestionProvider", 62 | "UnstructuredIngestionConfig", 63 | # Crypto 64 | 
"BCryptCryptoProvider", 65 | "BcryptCryptoConfig", 66 | "NaClCryptoConfig", 67 | "NaClCryptoProvider", 68 | # Database 69 | "PostgresDatabaseProvider", 70 | # Embeddings 71 | "LiteLLMEmbeddingProvider", 72 | "OllamaEmbeddingProvider", 73 | "OpenAIEmbeddingProvider", 74 | # Email 75 | "AsyncSMTPEmailProvider", 76 | "ConsoleMockEmailProvider", 77 | "SendGridEmailProvider", 78 | "MailerSendEmailProvider", 79 | # File 80 | "PostgresFileProvider", 81 | "S3FileProvider", 82 | # LLM 83 | "AnthropicCompletionProvider", 84 | "OpenAICompletionProvider", 85 | "R2RCompletionProvider", 86 | "LiteLLMCompletionProvider", 87 | # OCR 88 | "MistralOCRProvider", 89 | # Orchestration 90 | "HatchetOrchestrationProvider", 91 | "SimpleOrchestrationProvider", 92 | # Scheduler 93 | "APSchedulerProvider", 94 | ] 95 | -------------------------------------------------------------------------------- /py/core/providers/auth/__init__.py: -------------------------------------------------------------------------------- 1 | from .clerk import ClerkAuthProvider 2 | from .jwt import JwtAuthProvider 3 | from .r2r_auth import R2RAuthProvider 4 | from .supabase import SupabaseAuthProvider 5 | 6 | __all__ = [ 7 | "R2RAuthProvider", 8 | "SupabaseAuthProvider", 9 | "JwtAuthProvider", 10 | "ClerkAuthProvider", 11 | ] 12 | -------------------------------------------------------------------------------- /py/core/providers/crypto/__init__.py: -------------------------------------------------------------------------------- 1 | from .bcrypt import BcryptCryptoConfig, BCryptCryptoProvider 2 | from .nacl import NaClCryptoConfig, NaClCryptoProvider 3 | 4 | __all__ = [ 5 | "BCryptCryptoProvider", 6 | "BcryptCryptoConfig", 7 | "NaClCryptoConfig", 8 | "NaClCryptoProvider", 9 | ] 10 | -------------------------------------------------------------------------------- /py/core/providers/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .postgres import 
PostgresDatabaseProvider 2 | 3 | __all__ = [ 4 | "PostgresDatabaseProvider", 5 | ] 6 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/providers/database/prompts/__init__.py -------------------------------------------------------------------------------- /py/core/providers/database/prompts/chunk_enrichment.yaml: -------------------------------------------------------------------------------- 1 | chunk_enrichment: 2 | template: > 3 | ## Task: 4 | 5 | Enrich and refine the given chunk of text while maintaining its independence and precision. 6 | 7 | ## Context: 8 | Document Summary: {document_summary} 9 | Preceding Chunks: {preceding_chunks} 10 | Succeeding Chunks: {succeeding_chunks} 11 | 12 | ## Input Chunk: 13 | {chunk} 14 | 15 | ## Semantic Organization Guidelines: 16 | 1. Group related information: 17 | - Combine logically connected data points 18 | - Maintain context within each grouping 19 | - Preserve relationships between entities 20 | 21 | 2. Structure hierarchy: 22 | - Organize from general to specific 23 | - Use clear categorical divisions 24 | - Maintain parent-child relationships 25 | 26 | 3. Information density: 27 | - Balance completeness with clarity 28 | - Ensure each chunk can stand alone 29 | - Preserve essential context 30 | 31 | 4. Pattern recognition: 32 | - Standardize similar information 33 | - Use consistent formatting for similar data types 34 | - It is appropriate to restructure tables or lists in ways that are more advantageous for sematic matching 35 | - Maintain searchable patterns 36 | 37 | ## Output Requirements: 38 | 1. Each chunk should be independently meaningful 39 | 2. Related information should stay together 40 | 3. Format should support efficient matching 41 | 4. 
Original data relationships must be preserved 42 | 5. Context should be clear without external references 43 | 44 | Maximum length: {chunk_size} characters 45 | 46 | Output the restructured chunk only. 47 | 48 | ## Restructured Chunk: 49 | 50 | input_types: 51 | document_summary: str 52 | chunk: str 53 | preceding_chunks: str 54 | succeeding_chunks: str 55 | chunk_size: int 56 | overwrite_on_diff: true 57 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/collection_summary.yaml: -------------------------------------------------------------------------------- 1 | collection_summary: 2 | template: > 3 | ## Task: 4 | 5 | Generate a comprehensive collection-level summary that describes the overall content, themes, and relationships across multiple documents. The summary should provide a high-level understanding of what the collection contains and represents. 6 | 7 | ### Input Documents: 8 | 9 | Document Summaries: 10 | {document_summaries} 11 | 12 | ### Requirements: 13 | 14 | 1. SCOPE 15 | - Synthesize key themes and patterns across all documents 16 | - Identify common topics, entities, and relationships 17 | - Capture the collection's overall purpose or domain 18 | 19 | 2. STRUCTURE 20 | - Target length: Approximately 3-4 concise sentences 21 | - Focus on collective insights rather than individual document details 22 | 23 | 3. CONTENT GUIDELINES 24 | - Emphasize shared concepts and recurring elements 25 | - Highlight any temporal or thematic progression 26 | - Identify key stakeholders or entities that appear across documents 27 | - Note any significant relationships between documents 28 | 29 | 4. 
INTEGRATION PRINCIPLES 30 | - Connect related concepts across different documents 31 | - Identify overarching narratives or frameworks 32 | - Preserve important context from individual documents 33 | - Balance breadth of coverage with depth of insight 34 | 35 | ### Query: 36 | 37 | Generate a collection-level summary following the above requirements. Focus on synthesizing the key themes and relationships across all documents while maintaining clarity and concision. 38 | 39 | ## Response: 40 | input_types: 41 | document_summaries: str 42 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/dynamic_rag_agent.yaml: -------------------------------------------------------------------------------- 1 | dynamic_rag_agent: 2 | template: > 3 | ### You are a helpful agent that can search for information, the date is {date}. 4 | 5 | 6 | The response should contain line-item attributions to relevant search results, and be as informative if possible. Note that you will only be able to load {max_tool_context_length} tokens of context at a time, if the context surpasses this then it will be truncated. If possible, set filters which will reduce the context returned to only that which is specific, by means of '$eq' or '$overlap' filters. 7 | 8 | 9 | Search rarely exceeds the context window, while getting raw context can depending on the user data shown below. IF YOU CAN FETCH THE RAW CONTEXT, THEN DO SO. 10 | 11 | 12 | The available user documents and collections are shown below: 13 | 14 | <= Documents => 15 | {document_context} 16 | 17 | 18 | If no relevant results are found, then state that no results were found. If no obvious question is present given the available tools and context, then do not carry out a search, and instead ask for clarification. 19 | 20 | 21 | REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context. 
22 | 23 | input_types: 24 | date: str 25 | document_context: str 26 | max_tool_context_length: str 27 | 28 | overwrite_on_diff: true 29 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/graph_entity_description.yaml: -------------------------------------------------------------------------------- 1 | graph_entity_description: 2 | template: | 3 | Given the following information about an entity: 4 | 5 | Document Summary: 6 | {document_summary} 7 | 8 | Entity Information: 9 | {entity_info} 10 | 11 | Relationship Data: 12 | {relationships_txt} 13 | 14 | Generate a comprehensive entity description that: 15 | 16 | 1. Opens with a clear definition statement identifying the entity's primary classification and core function 17 | 2. Incorporates key data points from both the document summary and relationship information 18 | 3. Emphasizes the entity's role within its broader context or system 19 | 4. Highlights critical relationships, particularly those that: 20 | - Demonstrate hierarchical connections 21 | - Show functional dependencies 22 | - Indicate primary use cases or applications 23 | 24 | Format Requirements: 25 | - Length: 2-3 sentences 26 | - Style: Technical and precise 27 | - Structure: Definition + Context + Key Relationships 28 | - Tone: Objective and authoritative 29 | 30 | Integration Guidelines: 31 | - Prioritize information that appears in multiple sources 32 | - Resolve any conflicting information by favoring the most specific source 33 | - Include temporal context if relevant to the entity's current state or evolution 34 | 35 | Output should reflect the entity's complete nature while maintaining concision and clarity. 
36 | input_types: 37 | document_summary: str 38 | entity_info: str 39 | relationships_txt: str 40 | overwrite_on_diff: true 41 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/hyde.yaml: -------------------------------------------------------------------------------- 1 | hyde: 2 | template: > 3 | ### Instruction: 4 | 5 | Given the query that follows write a double newline separated list of {num_outputs} single paragraph distinct attempted answers to the given query. 6 | 7 | 8 | DO NOT generate any single answer which is likely to require information from multiple distinct documents, 9 | 10 | EACH single answer will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents. 11 | 12 | 13 | FOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two attempted answers would be 14 | 15 | `The key themes of Great Gatsby are ... ANSWER_CONTINUED` and `The key themes of 1984 are ... ANSWER_CONTINUED`, where `ANSWER_CONTINUED` IS TO BE COMPLETED BY YOU in your response. 16 | 17 | 18 | Here is the original user query to be transformed into answers: 19 | 20 | 21 | ### Query: 22 | 23 | {message} 24 | 25 | 26 | ### Response: 27 | input_types: 28 | num_outputs: int 29 | message: str 30 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/rag.yaml: -------------------------------------------------------------------------------- 1 | rag: 2 | template: > 3 | ## Task: 4 | 5 | Answer the query given immediately below given the context which follows later. Use line item references to like [c910e2e], [b12cd2f], ... refer to provided search results. 
6 | 7 | 8 | ### Query: 9 | 10 | {query} 11 | 12 | 13 | ### Context: 14 | 15 | {context} 16 | 17 | 18 | ### Query: 19 | 20 | {query} 21 | 22 | 23 | REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context. 24 | 25 | ## Response: 26 | input_types: 27 | query: str 28 | context: str 29 | overwrite_on_diff: true 30 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/rag_fusion.yaml: -------------------------------------------------------------------------------- 1 | rag_fusion: 2 | template: > 3 | ### Instruction: 4 | 5 | 6 | Given the following query that follows to write a double newline separated list of up to {num_outputs} queries meant to help answer the original query. 7 | 8 | DO NOT generate any single query which is likely to require information from multiple distinct documents, 9 | 10 | EACH single query will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents. 11 | 12 | FOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two queries would be 13 | 14 | `What are the key themes of Great Gatsby?` and `What are the key themes of 1984?`. 15 | 16 | Here is the original user query to be transformed into answers: 17 | 18 | 19 | ### Query: 20 | 21 | {message} 22 | 23 | 24 | ### Response: 25 | input_types: 26 | num_outputs: int 27 | message: str 28 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/static_rag_agent.yaml: -------------------------------------------------------------------------------- 1 | static_rag_agent: 2 | template: > 3 | ### You are a helpful agent that can search for information, the date is {date}. 
4 | 5 | When asked a question, YOU SHOULD ALWAYS USE YOUR SEARCH TOOL TO ATTEMPT TO SEARCH FOR RELEVANT INFORMATION THAT ANSWERS THE USER QUESTION. 6 | 7 | The response should contain line-item attributions to relevant search results, and be as informative if possible. 8 | 9 | If no relevant results are found, then state that no results were found. If no obvious question is present, then do not carry out a search, and instead ask for clarification. 10 | 11 | REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context. 12 | 13 | input_types: 14 | date: str 15 | 16 | overwrite_on_diff: true 17 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/summary.yaml: -------------------------------------------------------------------------------- 1 | summary: 2 | template: > 3 | ## Task: 4 | 5 | Your task is to generate a descriptive summary of the document that follows. Your objective is to return a summary that is roughly 10% of the input document size while retaining as many key points as possible. Your response should begin with `The document contains `. 6 | 7 | ### Document: 8 | 9 | {document} 10 | 11 | 12 | ### Query: 13 | 14 | Reminder: Your task is to generate a descriptive summary of the document that was given. Your objective is to return a summary that is roughly 10% of the input document size while retaining as many key points as possible. Your response should begin with `The document contains `. 15 | 16 | ## Response: 17 | input_types: 18 | document: str 19 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/system.yaml: -------------------------------------------------------------------------------- 1 | system: 2 | template: You are a helpful agent. 
3 | input_types: {} 4 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/vision_img.yaml: -------------------------------------------------------------------------------- 1 | vision_img: 2 | template: > 3 | First, provide a title for the image, then explain everything that you see. Be very thorough in your analysis as a user will need to understand the image without seeing it. If it is possible to transcribe the image to text directly, then do so. The more detail you provide, the better the user will understand the image. 4 | input_types: {} 5 | -------------------------------------------------------------------------------- /py/core/providers/database/prompts/vision_pdf.yaml: -------------------------------------------------------------------------------- 1 | vision_pdf: 2 | template: > 3 | Convert this PDF page to markdown format, preserving all content and formatting. Follow these guidelines: 4 | 5 | Text: 6 | - Maintain the original text hierarchy (headings, paragraphs, lists) 7 | - Preserve any special formatting (bold, italic, underline) 8 | - Include all footnotes, citations, and references 9 | - Keep text in its original reading order 10 | 11 | Tables: 12 | - Recreate tables using markdown table syntax 13 | - Preserve all headers, rows, and columns 14 | - Maintain alignment and formatting where possible 15 | - Include any table captions or notes 16 | 17 | Equations: 18 | - Convert mathematical equations using LaTeX notation 19 | - Preserve equation numbers if present 20 | - Include any surrounding context or references 21 | 22 | Images: 23 | - Enclose image descriptions within [FIG] and [/FIG] tags 24 | - Include detailed descriptions of: 25 | * Main subject matter 26 | * Text overlays or captions 27 | * Charts, graphs, or diagrams 28 | * Relevant colors, patterns, or visual elements 29 | - Maintain image placement relative to surrounding text 30 | 31 | Additional Elements: 32 | - Include 
page numbers if visible 33 | - Preserve headers and footers 34 | - Maintain sidebars or callout boxes 35 | - Keep any special symbols or characters 36 | 37 | Quality Requirements: 38 | - Ensure 100% content preservation 39 | - Maintain logical document flow 40 | - Verify all markdown syntax is valid 41 | - Double-check completeness before submitting 42 | input_types: {} 43 | -------------------------------------------------------------------------------- /py/core/providers/database/tokens.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from typing import Optional 3 | 4 | from core.base import Handler 5 | 6 | from .base import PostgresConnectionManager 7 | 8 | 9 | class PostgresTokensHandler(Handler): 10 | TABLE_NAME = "blacklisted_tokens" 11 | 12 | def __init__( 13 | self, project_name: str, connection_manager: PostgresConnectionManager 14 | ): 15 | super().__init__(project_name, connection_manager) 16 | 17 | async def create_tables(self): 18 | query = f""" 19 | CREATE TABLE IF NOT EXISTS {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} ( 20 | id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), 21 | token TEXT NOT NULL, 22 | blacklisted_at TIMESTAMPTZ DEFAULT NOW() 23 | ); 24 | CREATE INDEX IF NOT EXISTS idx_{self.project_name}_{PostgresTokensHandler.TABLE_NAME}_token 25 | ON {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (token); 26 | CREATE INDEX IF NOT EXISTS idx_{self.project_name}_{PostgresTokensHandler.TABLE_NAME}_blacklisted_at 27 | ON {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (blacklisted_at); 28 | """ 29 | await self.connection_manager.execute_query(query) 30 | 31 | async def blacklist_token( 32 | self, token: str, current_time: Optional[datetime] = None 33 | ): 34 | if current_time is None: 35 | current_time = datetime.utcnow() 36 | 37 | query = f""" 38 | INSERT INTO {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (token, 
def psql_quote_literal(value: str) -> str:
    """Quote *value* as a PostgreSQL string literal.

    Embedded single quotes are doubled (the SQL-standard escape) and the
    result is wrapped in single quotes.  This is a simple implementation -
    in production, prefer real query parameterization or your database
    driver's own quoting functions.
    """
    escaped = value.replace("'", "''")
    return f"'{escaped}'"
{to_email} 49 | Subject: Password Reset Request 50 | Body: 51 | Reset token: {reset_token} 52 | ----------------------------- 53 | """) 54 | 55 | async def send_password_changed_email( 56 | self, to_email: str, *args, **kwargs 57 | ) -> None: 58 | logger.info(f""" 59 | -------- Email Message -------- 60 | To: {to_email} 61 | Subject: Your Password Has Been Changed 62 | Body: 63 | Your password has been successfully changed. 64 | 65 | For security reasons, you will need to log in again on all your devices. 66 | ----------------------------- 67 | """) 68 | -------------------------------------------------------------------------------- /py/core/providers/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .litellm import LiteLLMEmbeddingProvider 2 | from .ollama import OllamaEmbeddingProvider 3 | from .openai import OpenAIEmbeddingProvider 4 | 5 | __all__ = [ 6 | "LiteLLMEmbeddingProvider", 7 | "OpenAIEmbeddingProvider", 8 | "OllamaEmbeddingProvider", 9 | ] 10 | -------------------------------------------------------------------------------- /py/core/providers/embeddings/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from litellm import get_model_info, token_counter 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def truncate_texts_to_token_limit(texts: list[str], model: str) -> list[str]: 9 | """ 10 | Truncate texts to fit within the model's token limit. 
11 | """ 12 | try: 13 | model_info = get_model_info(model=model) 14 | if not model_info.get("max_input_tokens"): 15 | return texts # No truncation needed if no limit specified 16 | 17 | truncated_texts = [] 18 | for text in texts: 19 | text_tokens = token_counter(model=model, text=text) 20 | assert model_info["max_input_tokens"] 21 | if text_tokens > model_info["max_input_tokens"]: 22 | estimated_chars = ( 23 | model_info["max_input_tokens"] * 3 24 | ) # Estimate 3 chars per token 25 | truncated_text = text[:estimated_chars] 26 | truncated_texts.append(truncated_text) 27 | logger.warning( 28 | f"Truncated text from {text_tokens} to ~{model_info['max_input_tokens']} tokens" 29 | ) 30 | else: 31 | truncated_texts.append(text) 32 | 33 | return truncated_texts 34 | except Exception as e: 35 | logger.warning(f"Failed to truncate texts: {str(e)}") 36 | return texts # Return original texts if truncation fails 37 | -------------------------------------------------------------------------------- /py/core/providers/file/__init__.py: -------------------------------------------------------------------------------- 1 | from .postgres import PostgresFileProvider 2 | from .s3 import S3FileProvider 3 | 4 | __all__ = [ 5 | "PostgresFileProvider", 6 | "S3FileProvider", 7 | ] 8 | -------------------------------------------------------------------------------- /py/core/providers/ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | from .r2r.base import R2RIngestionConfig, R2RIngestionProvider 3 | from .unstructured.base import ( 4 | UnstructuredIngestionConfig, 5 | UnstructuredIngestionProvider, 6 | ) 7 | 8 | __all__ = [ 9 | "R2RIngestionConfig", 10 | "R2RIngestionProvider", 11 | "UnstructuredIngestionProvider", 12 | "UnstructuredIngestionConfig", 13 | ] 14 | -------------------------------------------------------------------------------- /py/core/providers/llm/__init__.py: 
class SimpleOrchestrationProvider(OrchestrationProvider):
    """In-process orchestration: workflows are invoked directly instead of
    being dispatched to an external engine such as Hatchet."""

    def __init__(self, config: OrchestrationConfig):
        super().__init__(config)
        self.config = config
        # workflow name -> user-facing completion message
        self.messages: dict[str, str] = {}
        # Initialize the workflow registries up front so run_workflow raises
        # the intended ValueError (not AttributeError) when it is called
        # before register_workflows.
        self.ingestion_workflows: dict = {}
        self.graph_search_results_workflows: dict = {}

    async def start_worker(self):
        """No-op: there is no external worker process to start."""
        pass

    def get_worker(self, name: str, max_runs: int) -> Any:
        """No-op placeholder to satisfy the provider interface."""
        pass

    def step(self, *args, **kwargs) -> Any:
        """No-op decorator hook (unused by the simple provider)."""
        pass

    def workflow(self, *args, **kwargs) -> Any:
        """No-op decorator hook (unused by the simple provider)."""
        pass

    def failure(self, *args, **kwargs) -> Any:
        """No-op failure hook (unused by the simple provider)."""
        pass

    def register_workflows(
        self, workflow: Workflow, service: Any, messages: dict
    ) -> None:
        """Register the workflow factory for *workflow* and record the
        completion message returned for each workflow name.
        """
        for key, msg in messages.items():
            self.messages[key] = msg

        if workflow == Workflow.INGESTION:
            # Imported lazily to avoid a circular import at module load time.
            from core.main.orchestration import simple_ingestion_factory

            self.ingestion_workflows = simple_ingestion_factory(service)

        elif workflow == Workflow.GRAPH:
            from core.main.orchestration.simple.graph_workflow import (
                simple_graph_search_results_factory,
            )

            self.graph_search_results_workflows = (
                simple_graph_search_results_factory(service)
            )

    async def run_workflow(
        self, workflow_name: str, parameters: dict, options: dict
    ) -> dict[str, str]:
        """Execute a registered workflow synchronously.

        Args:
            workflow_name: Key previously registered via register_workflows.
            parameters: Expected to carry the payload under "request".
            options: Accepted for interface parity; unused here.

        Raises:
            ValueError: if *workflow_name* was never registered.
        """
        if workflow_name in self.ingestion_workflows:
            await self.ingestion_workflows[workflow_name](
                parameters.get("request")
            )
            return {"message": self.messages[workflow_name]}
        elif workflow_name in self.graph_search_results_workflows:
            await self.graph_search_results_workflows[workflow_name](
                parameters.get("request")
            )
            return {"message": self.messages[workflow_name]}
        else:
            raise ValueError(f"Workflow '{workflow_name}' not found.")
"""Implementation using APScheduler""" 12 | 13 | def __init__(self, config: SchedulerConfig): 14 | super().__init__(config) 15 | self.scheduler = AsyncIOScheduler() 16 | 17 | async def add_job(self, func, trigger, **kwargs): 18 | logger.info( 19 | f"Adding job {func.__name__} with trigger {trigger} and kwargs {kwargs}" 20 | ) 21 | self.scheduler.add_job(func, trigger, **kwargs) 22 | 23 | async def start(self): 24 | self.scheduler.start() 25 | logger.info("Scheduler started") 26 | 27 | async def shutdown(self): 28 | if self.scheduler.running: 29 | self.scheduler.shutdown() 30 | logger.info("Scheduler shutdown") 31 | 32 | async def __aenter__(self): 33 | await self.start() 34 | return self 35 | 36 | async def __aexit__(self, exc_type, exc, tb): 37 | await self.shutdown() 38 | -------------------------------------------------------------------------------- /py/core/utils/context.py: -------------------------------------------------------------------------------- 1 | from contextvars import ContextVar, Token 2 | 3 | project_schema_context: ContextVar[str | None] = ContextVar( 4 | "project_schema_context", default=None 5 | ) 6 | 7 | 8 | def get_current_project_schema() -> str | None: 9 | """Get the current project schema name from context.""" 10 | return project_schema_context.get() 11 | 12 | 13 | def set_project_schema(schema_name: str) -> Token: 14 | """Set the current project schema in context.""" 15 | return project_schema_context.set(schema_name) 16 | -------------------------------------------------------------------------------- /py/core/utils/sentry.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | 4 | import sentry_sdk 5 | 6 | 7 | def init_sentry(): 8 | dsn = os.getenv("R2R_SENTRY_DSN") 9 | if not dsn: 10 | return 11 | 12 | with contextlib.suppress(Exception): 13 | sentry_sdk.init( 14 | dsn=dsn, 15 | environment=os.getenv("R2R_SENTRY_ENVIRONMENT", "not_set"), 16 | 
traces_sample_rate=float( 17 | os.getenv("R2R_SENTRY_TRACES_SAMPLE_RATE", 1.0) 18 | ), 19 | profiles_sample_rate=float( 20 | os.getenv("R2R_SENTRY_PROFILES_SAMPLE_RATE", 1.0) 21 | ), 22 | ) 23 | -------------------------------------------------------------------------------- /py/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. 2 | -------------------------------------------------------------------------------- /py/migrations/alembic.ini: -------------------------------------------------------------------------------- 1 | [alembic] 2 | script_location = . 3 | sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/postgres 4 | 5 | [loggers] 6 | keys = root,sqlalchemy,alembic 7 | 8 | [handlers] 9 | keys = console 10 | 11 | [formatters] 12 | keys = generic 13 | 14 | [logger_root] 15 | level = WARN 16 | handlers = console 17 | qualname = 18 | 19 | [logger_sqlalchemy] 20 | level = WARN 21 | handlers = 22 | qualname = sqlalchemy.engine 23 | 24 | [logger_alembic] 25 | level = INFO 26 | handlers = 27 | qualname = alembic 28 | 29 | [handler_console] 30 | class = StreamHandler 31 | args = (sys.stderr,) 32 | level = NOTSET 33 | formatter = generic 34 | 35 | [formatter_generic] 36 | format = %(levelname)-5.5s [%(name)s] %(message)s 37 | datefmt = %H:%M:%S 38 | -------------------------------------------------------------------------------- /py/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | Schema: %(schema)s 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | ${imports if imports else ""} 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = ${repr(up_revision)} 16 | down_revision: Union[str, None] = ${repr(down_revision)} 17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 19 | 20 | def upgrade() -> None: 21 | # Get the schema name 22 | schema = op.get_context().get_context_kwargs.get('version_table_schema') 23 | 24 | """ 25 | ### Schema-aware migration 26 | All table operations should include the schema name, for example: 27 | 28 | op.create_tables( 29 | 'my_table', 30 | sa.Column('id', sa.Integer(), nullable=False), 31 | sa.Column('name', sa.String(), nullable=True), 32 | schema=schema 33 | ) 34 | 35 | op.create_index( 36 | 'idx_my_table_name', 37 | 'my_table', 38 | ['name'], 39 | schema=schema 40 | ) 41 | """ 42 | ${upgrades if upgrades else "pass"} 43 | 44 | def downgrade() -> None: 45 | # Get the schema name 46 | schema = op.get_context().get_context_kwargs.get('version_table_schema') 47 | 48 | """ 49 | ### Schema-aware downgrade 50 | Remember to include schema in all operations, for example: 51 | 52 | op.drop_table('my_table', schema=schema) 53 | """ 54 | ${downgrades if downgrades else "pass"} 55 | -------------------------------------------------------------------------------- /py/migrations/versions/7eb70560f406_add_limits_overrides_to_users.py: -------------------------------------------------------------------------------- 1 | """add_limits_overrides_to_users. 2 | 3 | Revision ID: 7eb70560f406 4 | Revises: c45a9cf6a8a4 5 | Create Date: 2025-01-03 20:27:16.139511 6 | """ 7 | 8 | import os 9 | from typing import Sequence, Union 10 | 11 | import sqlalchemy as sa 12 | from alembic import op 13 | from sqlalchemy import inspect 14 | 15 | # revision identifiers, used by Alembic. 
def check_if_upgrade_needed():
    """Check if the upgrade has already been applied.

    Returns False (skip) when the users table is missing or already has the
    limits_overrides column; True when the column must be added.
    """
    connection = op.get_bind()
    inspector = inspect(connection)

    # Check if users table exists in the project's schema
    if not inspector.has_table("users", schema=project_name):
        print(
            f"Migration not needed: '{project_name}.users' table doesn't exist"
        )
        return False

    # Column names currently present on the users table.
    users_columns = {
        col["name"]
        for col in inspector.get_columns("users", schema=project_name)
    }

    if "limits_overrides" in users_columns:
        print(
            "Migration not needed: users table already has limits_overrides column"
        )
        return False
    else:
        print("Migration needed: users table needs limits_overrides column")
        return True


def upgrade() -> None:
    """Add the nullable limits_overrides JSON column to users (idempotent)."""
    if not check_if_upgrade_needed():
        return

    # Add the limits_overrides column as JSONB with default NULL
    op.add_column(
        "users",
        sa.Column("limits_overrides", sa.JSON(), nullable=True),
        schema=project_name,
    )


def downgrade() -> None:
    """Revert the upgrade by dropping the limits_overrides column."""
    # Remove the limits_overrides column
    op.drop_column("users", "limits_overrides", schema=project_name)
"__version__", 14 | "R2RException", 15 | ] + shared_all 16 | 17 | 18 | def get_version(): 19 | return __version__ 20 | -------------------------------------------------------------------------------- /py/sdk/README.md: -------------------------------------------------------------------------------- 1 | # R2R Python SDK Documentation 2 | 3 | For the complete look at the R2R Python SDK, [visit our documentation.](https://r2r-docs.sciphi.ai/documentation/python-sdk/introduction) 4 | 5 | ## Installation 6 | 7 | Before starting, make sure you have completed the [R2R installation](/documentation/installation). 8 | 9 | Install the R2R Python SDK: 10 | 11 | ```bash 12 | pip install r2r 13 | ``` 14 | 15 | ## Getting Started 16 | 17 | 1. Import the R2R client: 18 | 19 | ```python 20 | from r2r import R2RClient 21 | ``` 22 | 23 | 2. Initialize the client: 24 | 25 | ```python 26 | client = R2RClient("http://localhost:7272") 27 | ``` 28 | 29 | 30 | 3. Check if R2R is running correctly: 31 | 32 | ```python 33 | health_response = client.health() 34 | # {"status":"ok"} 35 | ``` 36 | 37 | 4. Login (Optional): 38 | ```python 39 | client.register("me@email.com", "my_password") 40 | # client.verify_email("me@email.com", "my_verification_code") 41 | client.login("me@email.com", "my_password") 42 | ``` 43 | When using authentication the commands below automatically restrict the scope to a user's available documents. 
44 | -------------------------------------------------------------------------------- /py/sdk/__init__.py: -------------------------------------------------------------------------------- 1 | from .async_client import R2RAsyncClient 2 | from .sync_client import R2RClient 3 | 4 | __all__ = ["R2RAsyncClient", "R2RClient"] 5 | -------------------------------------------------------------------------------- /py/sdk/asnyc_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .chunks import ChunksSDK 2 | from .collections import CollectionsSDK 3 | from .conversations import ConversationsSDK 4 | from .documents import DocumentsSDK 5 | from .graphs import GraphsSDK 6 | from .indices import IndicesSDK 7 | from .prompts import PromptsSDK 8 | from .retrieval import RetrievalSDK 9 | from .system import SystemSDK 10 | from .users import UsersSDK 11 | 12 | __all__ = [ 13 | "ChunksSDK", 14 | "CollectionsSDK", 15 | "ConversationsSDK", 16 | "DocumentsSDK", 17 | "GraphsSDK", 18 | "IndicesSDK", 19 | "PromptsSDK", 20 | "RetrievalSDK", 21 | "SystemSDK", 22 | "UsersSDK", 23 | ] 24 | -------------------------------------------------------------------------------- /py/sdk/asnyc_methods/system.py: -------------------------------------------------------------------------------- 1 | from shared.api.models import ( 2 | WrappedGenericMessageResponse, 3 | WrappedServerStatsResponse, 4 | WrappedSettingsResponse, 5 | ) 6 | 7 | 8 | class SystemSDK: 9 | def __init__(self, client): 10 | self.client = client 11 | 12 | async def health(self) -> WrappedGenericMessageResponse: 13 | """Check the health of the R2R server.""" 14 | response_dict = await self.client._make_request( 15 | "GET", "health", version="v3" 16 | ) 17 | 18 | return WrappedGenericMessageResponse(**response_dict) 19 | 20 | async def settings(self) -> WrappedSettingsResponse: 21 | """Get the configuration settings for the R2R server. 
22 | 23 | Returns: 24 | dict: The server settings. 25 | """ 26 | response_dict = await self.client._make_request( 27 | "GET", "system/settings", version="v3" 28 | ) 29 | 30 | return WrappedSettingsResponse(**response_dict) 31 | 32 | async def status(self) -> WrappedServerStatsResponse: 33 | """Get statistics about the server, including the start time, uptime, 34 | CPU usage, and memory usage. 35 | 36 | Returns: 37 | dict: The server statistics. 38 | """ 39 | response_dict = await self.client._make_request( 40 | "GET", "system/status", version="v3" 41 | ) 42 | 43 | return WrappedServerStatsResponse(**response_dict) 44 | -------------------------------------------------------------------------------- /py/sdk/base/__init_.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/sdk/base/__init_.py -------------------------------------------------------------------------------- /py/sdk/base/base_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from shared.abstractions import R2RClientException 4 | 5 | 6 | class BaseClient: 7 | def __init__( 8 | self, 9 | base_url: str | None = None, 10 | timeout: float = 300.0, 11 | ): 12 | self.base_url = base_url or os.getenv( 13 | "R2R_API_BASE", "http://localhost:7272" 14 | ) 15 | self.timeout = timeout 16 | self.access_token: str | None = None 17 | self._refresh_token: str | None = None 18 | self._user_id: str | None = None 19 | self.api_key: str | None = os.getenv("R2R_API_KEY", None) 20 | self.project_name: str | None = None 21 | 22 | def _get_auth_header(self) -> dict[str, str]: 23 | if self.access_token and self.api_key: 24 | raise R2RClientException( 25 | message="Cannot have both access token and api key.", 26 | ) 27 | if self.access_token: 28 | return {"Authorization": f"Bearer {self.access_token}"} 29 | elif self.api_key: 30 | return 
{"x-api-key": self.api_key} 31 | else: 32 | return {} 33 | 34 | def _get_full_url(self, endpoint: str, version: str = "v3") -> str: 35 | return f"{self.base_url}/{version}/{endpoint}" 36 | 37 | def _prepare_request_args(self, endpoint: str, **kwargs) -> dict: 38 | headers = kwargs.pop("headers", {}) 39 | if (self.access_token or self.api_key) and endpoint not in [ 40 | "register", 41 | "login", 42 | "verify_email", 43 | ]: 44 | headers.update(self._get_auth_header()) 45 | 46 | if self.project_name: 47 | headers["x-project-name"] = self.project_name 48 | 49 | if ( 50 | kwargs.get("params", None) == {} 51 | or kwargs.get("params", None) is None 52 | ): 53 | kwargs.pop("params", None) 54 | 55 | return {"headers": headers, **kwargs} 56 | -------------------------------------------------------------------------------- /py/sdk/models.py: -------------------------------------------------------------------------------- 1 | from shared.abstractions import ( 2 | AggregateSearchResult, 3 | ChunkSearchResult, 4 | GenerationConfig, 5 | GraphCommunityResult, 6 | GraphEntityResult, 7 | GraphRelationshipResult, 8 | GraphSearchResult, 9 | GraphSearchResultType, 10 | GraphSearchSettings, 11 | HybridSearchSettings, 12 | IngestionMode, 13 | Message, 14 | MessageType, 15 | R2RException, 16 | R2RSerializable, 17 | SearchMode, 18 | SearchSettings, 19 | Token, 20 | User, 21 | select_search_filters, 22 | ) 23 | from shared.abstractions.graph import ( 24 | GraphCreationSettings, 25 | GraphEnrichmentSettings, 26 | ) 27 | from shared.api.models import ( 28 | AgentEvent, 29 | AgentResponse, 30 | Citation, 31 | CitationData, 32 | CitationEvent, 33 | Delta, 34 | DeltaPayload, 35 | FinalAnswerData, 36 | FinalAnswerEvent, 37 | MessageData, 38 | MessageDelta, 39 | MessageEvent, 40 | RAGResponse, 41 | SearchResultsData, 42 | SearchResultsEvent, 43 | SSEEventBase, 44 | ThinkingData, 45 | ThinkingEvent, 46 | ToolCallData, 47 | ToolCallEvent, 48 | ToolResultData, 49 | ToolResultEvent, 50 | 
UnknownEvent, 51 | ) 52 | 53 | __all__ = [ 54 | "AggregateSearchResult", 55 | "GenerationConfig", 56 | "HybridSearchSettings", 57 | "GraphCommunityResult", 58 | "GraphCreationSettings", 59 | "GraphEnrichmentSettings", 60 | "GraphEntityResult", 61 | "GraphRelationshipResult", 62 | "GraphSearchResult", 63 | "GraphSearchResultType", 64 | "GraphSearchSettings", 65 | "Message", 66 | "MessageType", 67 | "R2RException", 68 | "R2RSerializable", 69 | "Token", 70 | "ChunkSearchResult", 71 | "SearchSettings", 72 | "select_search_filters", 73 | "IngestionMode", 74 | "SearchMode", 75 | # "RAGResponse", 76 | "Citation", 77 | "RAGResponse", 78 | "AgentEvent", 79 | "AgentResponse", 80 | "SSEEventBase", 81 | "SearchResultsData", 82 | "SearchResultsEvent", 83 | "MessageData", 84 | "MessageDelta", 85 | "MessageEvent", 86 | "DeltaPayload", 87 | "Delta", 88 | "CitationData", 89 | "CitationEvent", 90 | "FinalAnswerData", 91 | "FinalAnswerEvent", 92 | "ToolCallData", 93 | "ToolCallEvent", 94 | "ToolResultData", 95 | "ToolResultEvent", 96 | "ThinkingEvent", 97 | "ThinkingData", 98 | "UnknownEvent", 99 | "User", 100 | ] 101 | -------------------------------------------------------------------------------- /py/sdk/sync_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .chunks import ChunksSDK 2 | from .collections import CollectionsSDK 3 | from .conversations import ConversationsSDK 4 | from .documents import DocumentsSDK 5 | from .graphs import GraphsSDK 6 | from .indices import IndicesSDK 7 | from .prompts import PromptsSDK 8 | from .retrieval import RetrievalSDK 9 | from .system import SystemSDK 10 | from .users import UsersSDK 11 | 12 | __all__ = [ 13 | "ChunksSDK", 14 | "CollectionsSDK", 15 | "ConversationsSDK", 16 | "DocumentsSDK", 17 | "GraphsSDK", 18 | "IndicesSDK", 19 | "PromptsSDK", 20 | "RetrievalSDK", 21 | "SystemSDK", 22 | "UsersSDK", 23 | ] 24 | 
class SystemSDK:
    """Synchronous client surface for the R2R /v3 system endpoints."""

    def __init__(self, client):
        # `client` is the R2R sync client; only its _make_request is used.
        self.client = client

    def health(self) -> WrappedGenericMessageResponse:
        """Check the health of the R2R server."""
        response_dict = self.client._make_request(
            "GET", "health", version="v3"
        )

        return WrappedGenericMessageResponse(**response_dict)

    def settings(self) -> WrappedSettingsResponse:
        """Get the configuration settings for the R2R server.

        Returns:
            WrappedSettingsResponse: The wrapped server settings.
        """
        response_dict = self.client._make_request(
            "GET", "system/settings", version="v3"
        )

        return WrappedSettingsResponse(**response_dict)

    def status(self) -> WrappedServerStatsResponse:
        """Get statistics about the server, including the start time, uptime,
        CPU usage, and memory usage.

        Returns:
            WrappedServerStatsResponse: The wrapped server statistics.
        """
        response_dict = self.client._make_request(
            "GET", "system/status", version="v3"
        )

        return WrappedServerStatsResponse(**response_dict)
{type(inputs[var]).__name__} instead." 35 | ) 36 | 37 | def _convert_type(self, type_name: str) -> type: 38 | type_mapping = {"int": int, "str": str} 39 | return type_mapping.get(type_name, str) 40 | -------------------------------------------------------------------------------- /py/shared/abstractions/tool.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Optional 2 | 3 | from ..abstractions import R2RSerializable 4 | 5 | 6 | class Tool(R2RSerializable): 7 | name: str 8 | description: str 9 | results_function: Callable 10 | llm_format_function: Optional[Callable] = None 11 | stream_function: Optional[Callable] = None 12 | parameters: Optional[dict[str, Any]] = None 13 | context: Optional[Any] = None 14 | 15 | class Config: 16 | populate_by_name = True 17 | arbitrary_types_allowed = True 18 | 19 | def set_context(self, context: Any) -> None: 20 | """Set the context for this tool.""" 21 | self.context = context 22 | 23 | async def execute(self, *args, **kwargs): 24 | """ 25 | Execute the tool with context awareness. 26 | This wraps the results_function to ensure context is available. 
27 | """ 28 | if self.context is None: 29 | raise ValueError( 30 | f"Tool '{self.name}' requires context but none was provided" 31 | ) 32 | 33 | # Call the actual implementation with context 34 | return await self.results_function(context=self.context, **kwargs) 35 | 36 | 37 | class ToolResult(R2RSerializable): 38 | raw_result: Any 39 | llm_formatted_result: str 40 | stream_result: Optional[str] = None 41 | -------------------------------------------------------------------------------- /py/shared/abstractions/user.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | from uuid import UUID 4 | 5 | from pydantic import BaseModel, Field 6 | 7 | from shared.abstractions import R2RSerializable 8 | 9 | from ..utils import generate_default_user_collection_id 10 | 11 | 12 | class Collection(BaseModel): 13 | id: UUID 14 | name: str 15 | description: Optional[str] = None 16 | created_at: datetime = Field( 17 | default_factory=datetime.utcnow, 18 | ) 19 | updated_at: datetime = Field( 20 | default_factory=datetime.utcnow, 21 | ) 22 | 23 | class Config: 24 | populate_by_name = True 25 | from_attributes = True 26 | 27 | def __init__(self, **data): 28 | super().__init__(**data) 29 | if self.id is None: 30 | self.id = generate_default_user_collection_id(self.name) 31 | 32 | 33 | class Token(BaseModel): 34 | token: str 35 | token_type: str 36 | 37 | 38 | class TokenData(BaseModel): 39 | email: str 40 | token_type: str 41 | exp: datetime 42 | 43 | 44 | class User(R2RSerializable): 45 | id: UUID 46 | email: str 47 | is_active: bool = True 48 | is_superuser: bool = False 49 | created_at: datetime = datetime.now() 50 | updated_at: datetime = datetime.now() 51 | is_verified: bool = False 52 | collection_ids: list[UUID] = [] 53 | graph_ids: list[UUID] = [] 54 | document_ids: list[UUID] = [] 55 | 56 | # Optional fields (to update or set at creation) 57 | limits_overrides: Optional[dict] 
= None 58 | metadata: Optional[dict] = None 59 | verification_code_expiry: Optional[datetime] = None 60 | name: Optional[str] = None 61 | bio: Optional[str] = None 62 | profile_picture: Optional[str] = None 63 | total_size_in_bytes: Optional[int] = None 64 | num_files: Optional[int] = None 65 | 66 | account_type: str = "password" 67 | hashed_password: Optional[str] = None 68 | google_id: Optional[str] = None 69 | github_id: Optional[str] = None 70 | -------------------------------------------------------------------------------- /py/shared/api/models/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/auth/__init__.py -------------------------------------------------------------------------------- /py/shared/api/models/auth/responses.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from shared.abstractions import Token 4 | from shared.api.models.base import R2RResults 5 | 6 | 7 | class TokenResponse(BaseModel): 8 | access_token: Token 9 | refresh_token: Token 10 | 11 | 12 | # Create wrapped versions of each response 13 | WrappedTokenResponse = R2RResults[TokenResponse] 14 | -------------------------------------------------------------------------------- /py/shared/api/models/base.py: -------------------------------------------------------------------------------- 1 | from typing import Generic, TypeVar 2 | 3 | from pydantic import BaseModel 4 | 5 | T = TypeVar("T") 6 | 7 | 8 | class R2RResults(BaseModel, Generic[T]): 9 | results: T 10 | 11 | 12 | class PaginatedR2RResult(BaseModel, Generic[T]): 13 | results: T 14 | total_entries: int 15 | 16 | 17 | class GenericBooleanResponse(BaseModel): 18 | success: bool 19 | 20 | 21 | class GenericMessageResponse(BaseModel): 22 | message: str 23 | 24 | 25 | WrappedBooleanResponse = 
R2RResults[GenericBooleanResponse] 26 | WrappedGenericMessageResponse = R2RResults[GenericMessageResponse] 27 | -------------------------------------------------------------------------------- /py/shared/api/models/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/graph/__init__.py -------------------------------------------------------------------------------- /py/shared/api/models/graph/responses.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | from uuid import UUID 4 | 5 | from pydantic import BaseModel 6 | 7 | from shared.abstractions.graph import Community, Entity, Relationship 8 | from shared.api.models.base import PaginatedR2RResult, R2RResults 9 | 10 | WrappedEntityResponse = R2RResults[Entity] 11 | WrappedEntitiesResponse = PaginatedR2RResult[list[Entity]] 12 | WrappedRelationshipResponse = R2RResults[Relationship] 13 | WrappedRelationshipsResponse = PaginatedR2RResult[list[Relationship]] 14 | WrappedCommunityResponse = R2RResults[Community] 15 | WrappedCommunitiesResponse = PaginatedR2RResult[list[Community]] 16 | 17 | 18 | class GraphResponse(BaseModel): 19 | id: UUID 20 | collection_id: UUID 21 | name: str 22 | description: Optional[str] 23 | status: str 24 | created_at: datetime 25 | updated_at: datetime 26 | document_ids: list[UUID] 27 | 28 | 29 | # Graph Responses 30 | WrappedGraphResponse = R2RResults[GraphResponse] 31 | WrappedGraphsResponse = PaginatedR2RResult[list[GraphResponse]] 32 | -------------------------------------------------------------------------------- /py/shared/api/models/ingestion/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/ingestion/__init__.py -------------------------------------------------------------------------------- /py/shared/api/models/ingestion/responses.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, TypeVar 2 | from uuid import UUID 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from shared.api.models.base import PaginatedR2RResult, R2RResults 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | class IngestionResponse(BaseModel): 12 | message: str = Field( 13 | ..., 14 | description="A message describing the result of the ingestion request.", 15 | ) 16 | task_id: Optional[UUID] = Field( 17 | None, 18 | description="The task ID of the ingestion request.", 19 | ) 20 | document_id: UUID = Field( 21 | ..., 22 | description="The ID of the document that was ingested.", 23 | ) 24 | 25 | class Config: 26 | json_schema_extra = { 27 | "example": { 28 | "message": "Ingestion task queued successfully.", 29 | "task_id": "c68dc72e-fc23-5452-8f49-d7bd46088a96", 30 | "document_id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1", 31 | } 32 | } 33 | 34 | 35 | class UpdateResponse(BaseModel): 36 | message: str = Field( 37 | ..., 38 | description="A message describing the result of the ingestion request.", 39 | ) 40 | task_id: Optional[UUID] = Field( 41 | None, 42 | description="The task ID of the ingestion request.", 43 | ) 44 | document_ids: list[UUID] = Field( 45 | ..., 46 | description="The ID of the document that was ingested.", 47 | ) 48 | 49 | class Config: 50 | json_schema_extra = { 51 | "example": { 52 | "message": "Update task queued successfully.", 53 | "task_id": "c68dc72e-fc23-5452-8f49-d7bd46088a96", 54 | "document_ids": ["9fbe403b-c11c-5aae-8ade-ef22980c3ad1"], 55 | } 56 | } 57 | 58 | 59 | class VectorIndexResponse(BaseModel): 60 | index: dict[str, Any] 61 | 62 | 63 | class VectorIndicesResponse(BaseModel): 
64 | indices: list[VectorIndexResponse] 65 | 66 | 67 | WrappedIngestionResponse = R2RResults[IngestionResponse] 68 | WrappedMetadataUpdateResponse = R2RResults[IngestionResponse] 69 | WrappedUpdateResponse = R2RResults[UpdateResponse] 70 | 71 | WrappedVectorIndexResponse = R2RResults[VectorIndexResponse] 72 | WrappedVectorIndicesResponse = PaginatedR2RResult[VectorIndicesResponse] 73 | -------------------------------------------------------------------------------- /py/shared/api/models/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/management/__init__.py -------------------------------------------------------------------------------- /py/shared/api/models/retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/retrieval/__init__.py -------------------------------------------------------------------------------- /py/shared/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_utils import ( 2 | _decorate_vector_type, 3 | _get_vector_column_str, 4 | deep_update, 5 | dump_collector, 6 | dump_obj, 7 | format_search_results_for_llm, 8 | generate_default_prompt_id, 9 | generate_default_user_collection_id, 10 | generate_document_id, 11 | generate_entity_document_id, 12 | generate_extraction_id, 13 | generate_id, 14 | generate_user_id, 15 | validate_uuid, 16 | yield_sse_event, 17 | ) 18 | from .splitter.text import RecursiveCharacterTextSplitter, TextSplitter 19 | 20 | __all__ = [ 21 | "format_search_results_for_llm", 22 | # ID generation 23 | "generate_id", 24 | "generate_document_id", 25 | "generate_extraction_id", 26 | "generate_default_user_collection_id", 27 | "generate_user_id", 28 | 
"generate_default_prompt_id", 29 | "generate_entity_document_id", 30 | # Other 31 | "validate_uuid", 32 | "deep_update", 33 | # Text splitter 34 | "RecursiveCharacterTextSplitter", 35 | "TextSplitter", 36 | # Vector utils 37 | "_decorate_vector_type", 38 | "_get_vector_column_str", 39 | "yield_sse_event", 40 | "dump_collector", 41 | "dump_obj", 42 | ] 43 | -------------------------------------------------------------------------------- /py/shared/utils/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | from .text import RecursiveCharacterTextSplitter 2 | 3 | __all__ = ["RecursiveCharacterTextSplitter"] 4 | -------------------------------------------------------------------------------- /py/tests/integration/test_base.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from r2r import R2RException 4 | 5 | 6 | class BaseTest: 7 | """Base class for all test classes with common utilities.""" 8 | 9 | @staticmethod 10 | async def cleanup_resource(cleanup_func, 11 | resource_id: Optional[str] = None) -> None: 12 | """Generic cleanup helper that won't fail the test if cleanup fails.""" 13 | if resource_id: 14 | try: 15 | await cleanup_func(id=resource_id) 16 | except R2RException: 17 | pass 18 | -------------------------------------------------------------------------------- /py/tests/scaling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/tests/scaling/__init__.py -------------------------------------------------------------------------------- /py/tests/unit/retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/tests/unit/retrieval/__init__.py 
-------------------------------------------------------------------------------- /services/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/services/README.md -------------------------------------------------------------------------------- /services/clustering/Dockerfile.clustering: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim AS builder 2 | 3 | # Install system dependencies 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | gcc g++ musl-dev curl libffi-dev \ 6 | && apt-get clean && rm -rf /var/lib/apt/lists/* \ 7 | && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 8 | 9 | RUN pip install --no-cache-dir poetry 10 | 11 | # Add Rust to PATH 12 | ENV PATH="/root/.cargo/bin:${PATH}" 13 | 14 | ENV PYTHONDONTWRITEBYTECODE=1 15 | ENV PYTHONUNBUFFERED=1 16 | 17 | WORKDIR /app 18 | 19 | # Install graspologic and other dependencies 20 | RUN pip install --no-cache-dir fastapi uvicorn networkx "graspologic[leiden]" future pydantic==2.8.2 21 | 22 | COPY main.py . 
23 | 24 | EXPOSE 7276 25 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7276"] 26 | -------------------------------------------------------------------------------- /services/unstructured/Dockerfile.unstructured: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim AS builder 2 | 3 | # Install system dependencies (including those needed for Unstructured and OpenCV) 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \ 6 | tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \ 7 | poppler-utils libmagic1 pandoc libreoffice \ 8 | libgl1-mesa-glx libglib2.0-0 \ 9 | && apt-get clean && rm -rf /var/lib/apt/lists/* 10 | 11 | ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata 12 | 13 | ENV PYTHONDONTWRITEBYTECODE=1 14 | ENV PYTHONUNBUFFERED=1 15 | 16 | WORKDIR /app 17 | 18 | RUN pip install --no-cache-dir unstructured "unstructured[all-docs]" 19 | 20 | 21 | ENV NLTK_DATA=/usr/share/nltk_data 22 | RUN mkdir -p ${NLTK_DATA} 23 | RUN python -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng 24 | 25 | RUN python -c "from unstructured.partition.model_init import initialize; initialize()" 26 | 27 | RUN pip install gunicorn uvicorn fastapi httpx 28 | 29 | COPY main.py . 
30 | 31 | EXPOSE 7275 32 | 33 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"] 34 | -------------------------------------------------------------------------------- /services/unstructured/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/services/unstructured/README.md -------------------------------------------------------------------------------- /services/unstructured/main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import concurrent.futures 4 | import logging 5 | import os 6 | from io import BytesIO 7 | from typing import Optional 8 | 9 | from fastapi import FastAPI, HTTPException 10 | from pydantic import BaseModel 11 | from unstructured.partition.auto import partition 12 | 13 | logger = logging.getLogger() 14 | 15 | app = FastAPI() 16 | 17 | 18 | class PartitionRequestModel(BaseModel): 19 | file_content: bytes 20 | ingestion_config: dict 21 | filename: Optional[str] = None 22 | 23 | 24 | class PartitionResponseModel(BaseModel): 25 | elements: list[dict] 26 | 27 | 28 | executor = concurrent.futures.ThreadPoolExecutor( 29 | max_workers=int(os.environ.get("MAX_INGESTION_WORKERS", 10)) 30 | ) 31 | 32 | 33 | def run_partition(file_content: str, filename: str, ingestion_config: dict) -> list[dict]: 34 | file_content_bytes = base64.b64decode(file_content) 35 | file_io = BytesIO(file_content_bytes) 36 | elements = partition(file=file_io, file_filename=filename, **ingestion_config) 37 | return [element.to_dict() for element in elements] 38 | 39 | 40 | @app.get("/health") 41 | async def health_endpoint(): 42 | return {"status": "ok"} 43 | 44 | 45 | @app.post("/partition", response_model=PartitionResponseModel) 46 | async def partition_endpoint(request: PartitionRequestModel): 47 | try: 48 | logger.info(f"Partitioning request received: 
{request}") 49 | loop = asyncio.get_event_loop() 50 | elements = await loop.run_in_executor( 51 | executor, 52 | run_partition, 53 | request.file_content, 54 | request.filename, 55 | request.ingestion_config, 56 | ) 57 | logger.info("Partitioning completed") 58 | return PartitionResponseModel(elements=elements) 59 | except Exception as e: 60 | logger.error(f"Error partitioning file: {str(e)}") 61 | raise HTTPException(status_code=500, detail=str(e)) 62 | --------------------------------------------------------------------------------