├── .env.example
├── .gitattributes
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── custom.md
    │   └── feature_request.md
    ├── actions
    │   ├── login-docker
    │   │   └── action.yml
    │   ├── setup-docker
    │   │   └── action.yml
    │   ├── setup-postgres-ext
    │   │   └── action.yml
    │   ├── setup-python-full
    │   │   └── action.yml
    │   ├── setup-python-light
    │   │   └── action.yml
    │   ├── start-r2r-full
    │   │   └── action.yml
    │   └── start-r2r-light
    │   │   └── action.yml
    └── workflows
    │   ├── build-cluster-service-docker.yml
    │   ├── build-r2r-docker.yml
    │   ├── build-unst-service-docker.yml
    │   ├── publish-to-npm.yml
    │   ├── publish-to-pypi.yml
    │   ├── quality.yml
    │   ├── r2r-full-py-integration-tests.yml
    │   ├── r2r-js-sdk-ci.yml
    │   ├── r2r-js-sdk-integration-tests.yml
    │   └── r2r-light-py-integration-tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.md
├── MANIFEST.md
├── README.md
├── SECURITY.md
├── docker
    ├── compose.full.swarm.yaml
    ├── compose.full.yaml
    ├── compose.yaml
    ├── env
    │   ├── hatchet.env
    │   ├── minio.env
    │   ├── postgres.env
    │   ├── r2r-dashboard.env
    │   ├── r2r-full.env
    │   └── r2r.env
    ├── fluent-bit
    │   ├── fluent-bit.conf
    │   └── parsers.conf
    ├── scripts
    │   ├── create-hatchet-db.sh
    │   ├── setup-token.sh
    │   └── start-r2r.sh
    ├── user_configs
    │   └── README.md
    └── user_tools
    │   ├── README.md
    │   └── user_requirements.txt
├── js
    ├── README.md
    └── sdk
    │   ├── .prettierignore
    │   ├── README.md
    │   ├── __tests__
    │       ├── ChunksIntegrationSuperUser.test.ts
    │       ├── CollectionsIntegrationSuperUser.test.ts
    │       ├── ConversationsIntegrationSuperUser.test.ts
    │       ├── ConversationsIntegrationUser.test.ts
    │       ├── DocumentsAndCollectionsIntegrationUser.test.ts
    │       ├── DocumentsIntegrationSuperUser.test.ts
    │       ├── GraphsIntegrationSuperUser.test.ts
    │       ├── PromptsIntegrationSuperUser.test.ts
    │       ├── RetrievalIntegrationSuperUser.test.ts
    │       ├── SystemIntegrationSuperUser.test.ts
    │       ├── SystemIntegrationUser.test.ts
    │       ├── UsersIntegrationSuperUser.test.ts
    │       └── util
    │       │   └── typeTransformer.test.ts
    │   ├── examples
    │       ├── data
    │       │   ├── folder
    │       │   │   ├── karamozov.txt
    │       │   │   └── myshkin.txt
    │       │   ├── invalid.json
    │       │   ├── marmeladov.txt
    │       │   ├── raskolnikov.txt
    │       │   ├── raskolnikov_2.txt
    │       │   ├── sonia.txt
    │       │   └── zametov.txt
    │       └── hello_r2r.js
    │   ├── jest.config.js
    │   ├── package-lock.json
    │   ├── package.json
    │   ├── pnpm-lock.yaml
    │   ├── src
    │       ├── baseClient.ts
    │       ├── index.ts
    │       ├── r2rClient.ts
    │       ├── types.ts
    │       ├── utils
    │       │   ├── index.ts
    │       │   ├── typeTransformer.ts
    │       │   └── utils.ts
    │       └── v3
    │       │   └── clients
    │       │       ├── chunks.ts
    │       │       ├── collections.ts
    │       │       ├── conversations.ts
    │       │       ├── documents.ts
    │       │       ├── graphs.ts
    │       │       ├── indices.ts
    │       │       ├── prompts.ts
    │       │       ├── retrieval.ts
    │       │       ├── system.ts
    │       │       └── users.ts
    │   └── tsconfig.json
├── llms.txt
├── py
    ├── .dockerignore
    ├── Dockerfile
    ├── README.md
    ├── all_possible_config.toml
    ├── core
    │   ├── __init__.py
    │   ├── agent
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── rag.py
    │   │   └── research.py
    │   ├── base
    │   │   ├── __init__.py
    │   │   ├── abstractions
    │   │   │   └── __init__.py
    │   │   ├── agent
    │   │   │   ├── __init__.py
    │   │   │   ├── agent.py
    │   │   │   └── tools
    │   │   │   │   ├── built_in
    │   │   │   │       ├── get_file_content.py
    │   │   │   │       ├── search_file_descriptions.py
    │   │   │   │       ├── search_file_knowledge.py
    │   │   │   │       ├── tavily_extract.py
    │   │   │   │       ├── tavily_search.py
    │   │   │   │       ├── web_scrape.py
    │   │   │   │       └── web_search.py
    │   │   │   │   └── registry.py
    │   │   ├── api
    │   │   │   └── models
    │   │   │   │   └── __init__.py
    │   │   ├── parsers
    │   │   │   ├── __init__.py
    │   │   │   └── base_parser.py
    │   │   ├── providers
    │   │   │   ├── __init__.py
    │   │   │   ├── auth.py
    │   │   │   ├── base.py
    │   │   │   ├── crypto.py
    │   │   │   ├── database.py
    │   │   │   ├── email.py
    │   │   │   ├── embedding.py
    │   │   │   ├── file.py
    │   │   │   ├── ingestion.py
    │   │   │   ├── llm.py
    │   │   │   ├── ocr.py
    │   │   │   ├── orchestration.py
    │   │   │   └── scheduler.py
    │   │   └── utils
    │   │   │   └── __init__.py
    │   ├── configs
    │   │   ├── full.toml
    │   │   ├── full_azure.toml
    │   │   ├── full_lm_studio.toml
    │   │   ├── full_ollama.toml
    │   │   ├── gemini.toml
    │   │   ├── lm_studio.toml
    │   │   ├── ollama.toml
    │   │   ├── r2r_azure.toml
    │   │   ├── r2r_azure_with_test_limits.toml
    │   │   ├── r2r_with_auth.toml
    │   │   └── tavily.toml
    │   ├── examples
    │   │   ├── __init__.py
    │   │   ├── data
    │   │   │   ├── DeepSeek_R1.pdf
    │   │   │   ├── aristotle.txt
    │   │   │   ├── aristotle_v2.txt
    │   │   │   ├── aristotle_v3.txt
    │   │   │   ├── got.txt
    │   │   │   ├── graphrag.pdf
    │   │   │   ├── lyft_2021.pdf
    │   │   │   ├── pg_essay_1.html
    │   │   │   ├── pg_essay_2.html
    │   │   │   ├── pg_essay_3.html
    │   │   │   ├── pg_essay_4.html
    │   │   │   ├── pg_essay_5.html
    │   │   │   ├── sample.mp3
    │   │   │   ├── sample2.mp3
    │   │   │   ├── screen_shot.png
    │   │   │   ├── test.txt
    │   │   │   ├── uber_2021.pdf
    │   │   │   └── yc_companies.txt
    │   │   ├── hello_r2r.ipynb
    │   │   ├── hello_r2r.py
    │   │   └── supported_file_types
    │   │   │   ├── bmp.bmp
    │   │   │   ├── css.css
    │   │   │   ├── csv.csv
    │   │   │   ├── doc.doc
    │   │   │   ├── docx.docx
    │   │   │   ├── eml.eml
    │   │   │   ├── epub.epub
    │   │   │   ├── heic.heic
    │   │   │   ├── html.html
    │   │   │   ├── jpeg.jpeg
    │   │   │   ├── jpg.jpg
    │   │   │   ├── js.js
    │   │   │   ├── json.json
    │   │   │   ├── md.md
    │   │   │   ├── msg.msg
    │   │   │   ├── odt.odt
    │   │   │   ├── org.org
    │   │   │   ├── p7s.p7s
    │   │   │   ├── pdf.pdf
    │   │   │   ├── png.png
    │   │   │   ├── ppt.ppt
    │   │   │   ├── pptx.pptx
    │   │   │   ├── py.py
    │   │   │   ├── rst.rst
    │   │   │   ├── rtf.rtf
    │   │   │   ├── tiff.tiff
    │   │   │   ├── ts.ts
    │   │   │   ├── tsv.tsv
    │   │   │   ├── txt.txt
    │   │   │   ├── xls.xls
    │   │   │   └── xlsx.xlsx
    │   ├── main
    │   │   ├── __init__.py
    │   │   ├── abstractions.py
    │   │   ├── api
    │   │   │   └── v3
    │   │   │   │   ├── base_router.py
    │   │   │   │   ├── chunks_router.py
    │   │   │   │   ├── collections_router.py
    │   │   │   │   ├── conversations_router.py
    │   │   │   │   ├── documents_router.py
    │   │   │   │   ├── graph_router.py
    │   │   │   │   ├── indices_router.py
    │   │   │   │   ├── prompts_router.py
    │   │   │   │   ├── retrieval_router.py
    │   │   │   │   ├── system_router.py
    │   │   │   │   └── users_router.py
    │   │   ├── app.py
    │   │   ├── app_entry.py
    │   │   ├── assembly
    │   │   │   ├── __init__.py
    │   │   │   ├── builder.py
    │   │   │   ├── factory.py
    │   │   │   └── utils.py
    │   │   ├── config.py
    │   │   ├── middleware
    │   │   │   ├── __init__.py
    │   │   │   └── project_schema.py
    │   │   ├── orchestration
    │   │   │   ├── __init__.py
    │   │   │   ├── hatchet
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph_workflow.py
    │   │   │   │   └── ingestion_workflow.py
    │   │   │   └── simple
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph_workflow.py
    │   │   │   │   └── ingestion_workflow.py
    │   │   └── services
    │   │   │   ├── __init__.py
    │   │   │   ├── auth_service.py
    │   │   │   ├── base.py
    │   │   │   ├── graph_service.py
    │   │   │   ├── ingestion_service.py
    │   │   │   ├── maintenance_service.py
    │   │   │   ├── management_service.py
    │   │   │   └── retrieval_service.py
    │   ├── parsers
    │   │   ├── __init__.py
    │   │   ├── media
    │   │   │   ├── __init__.py
    │   │   │   ├── audio_parser.py
    │   │   │   ├── bmp_parser.py
    │   │   │   ├── doc_parser.py
    │   │   │   ├── docx_parser.py
    │   │   │   ├── img_parser.py
    │   │   │   ├── odt_parser.py
    │   │   │   ├── pdf_parser.py
    │   │   │   ├── ppt_parser.py
    │   │   │   ├── pptx_parser.py
    │   │   │   └── rtf_parser.py
    │   │   ├── structured
    │   │   │   ├── __init__.py
    │   │   │   ├── csv_parser.py
    │   │   │   ├── eml_parser.py
    │   │   │   ├── epub_parser.py
    │   │   │   ├── json_parser.py
    │   │   │   ├── msg_parser.py
    │   │   │   ├── org_parser.py
    │   │   │   ├── p7s_parser.py
    │   │   │   ├── rst_parser.py
    │   │   │   ├── tsv_parser.py
    │   │   │   ├── xls_parser.py
    │   │   │   └── xlsx_parser.py
    │   │   └── text
    │   │   │   ├── __init__.py
    │   │   │   ├── css_parser.py
    │   │   │   ├── html_parser.py
    │   │   │   ├── js_parser.py
    │   │   │   ├── md_parser.py
    │   │   │   ├── python_parser.py
    │   │   │   ├── text_parser.py
    │   │   │   └── ts_parser.py
    │   ├── providers
    │   │   ├── __init__.py
    │   │   ├── auth
    │   │   │   ├── __init__.py
    │   │   │   ├── clerk.py
    │   │   │   ├── jwt.py
    │   │   │   ├── r2r_auth.py
    │   │   │   └── supabase.py
    │   │   ├── crypto
    │   │   │   ├── __init__.py
    │   │   │   ├── bcrypt.py
    │   │   │   └── nacl.py
    │   │   ├── database
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   ├── chunks.py
    │   │   │   ├── collections.py
    │   │   │   ├── conversations.py
    │   │   │   ├── documents.py
    │   │   │   ├── filters.py
    │   │   │   ├── graphs.py
    │   │   │   ├── limits.py
    │   │   │   ├── maintenance.py
    │   │   │   ├── postgres.py
    │   │   │   ├── prompts
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── chunk_enrichment.yaml
    │   │   │   │   ├── collection_summary.yaml
    │   │   │   │   ├── dynamic_rag_agent.yaml
    │   │   │   │   ├── dynamic_rag_agent_xml_tooling.yaml
    │   │   │   │   ├── graph_communities.yaml
    │   │   │   │   ├── graph_entity_description.yaml
    │   │   │   │   ├── graph_extraction.yaml
    │   │   │   │   ├── hyde.yaml
    │   │   │   │   ├── rag.yaml
    │   │   │   │   ├── rag_fusion.yaml
    │   │   │   │   ├── static_rag_agent.yaml
    │   │   │   │   ├── static_research_agent.yaml
    │   │   │   │   ├── summary.yaml
    │   │   │   │   ├── system.yaml
    │   │   │   │   ├── vision_img.yaml
    │   │   │   │   └── vision_pdf.yaml
    │   │   │   ├── prompts_handler.py
    │   │   │   ├── tokens.py
    │   │   │   ├── users.py
    │   │   │   └── utils.py
    │   │   ├── email
    │   │   │   ├── __init__.py
    │   │   │   ├── console_mock.py
    │   │   │   ├── mailersend.py
    │   │   │   ├── sendgrid.py
    │   │   │   └── smtp.py
    │   │   ├── embeddings
    │   │   │   ├── __init__.py
    │   │   │   ├── litellm.py
    │   │   │   ├── ollama.py
    │   │   │   ├── openai.py
    │   │   │   └── utils.py
    │   │   ├── file
    │   │   │   ├── __init__.py
    │   │   │   ├── postgres.py
    │   │   │   └── s3.py
    │   │   ├── ingestion
    │   │   │   ├── __init__.py
    │   │   │   ├── r2r
    │   │   │   │   └── base.py
    │   │   │   └── unstructured
    │   │   │   │   └── base.py
    │   │   ├── llm
    │   │   │   ├── __init__.py
    │   │   │   ├── anthropic.py
    │   │   │   ├── azure_foundry.py
    │   │   │   ├── litellm.py
    │   │   │   ├── openai.py
    │   │   │   ├── r2r_llm.py
    │   │   │   └── utils.py
    │   │   ├── ocr
    │   │   │   ├── __init__.py
    │   │   │   └── mistral.py
    │   │   ├── orchestration
    │   │   │   ├── __init__.py
    │   │   │   ├── hatchet.py
    │   │   │   └── simple.py
    │   │   └── scheduler
    │   │   │   ├── __init__.py
    │   │   │   └── apscheduler.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── context.py
    │   │   ├── logging_config.py
    │   │   ├── sentry.py
    │   │   └── serper.py
    ├── migrations
    │   ├── README
    │   ├── alembic.ini
    │   ├── env.py
    │   ├── script.py.mako
    │   └── versions
    │   │   ├── 2fac23e4d91b_migrate_to_document_search.py
    │   │   ├── 3efc7b3b1b3d_add_total_tokens_count.py
    │   │   ├── 7eb70560f406_add_limits_overrides_to_users.py
    │   │   ├── 8077140e1e99_v3_api_database_revision.py
    │   │   ├── c45a9cf6a8a4_add_user_and_document_count_to_.py
    │   │   └── d342e632358a_migrate_to_asyncpg.py
    ├── pyproject.toml
    ├── r2r
    │   ├── __init__.py
    │   ├── mcp.py
    │   ├── r2r.toml
    │   └── serve.py
    ├── sdk
    │   ├── README.md
    │   ├── __init__.py
    │   ├── asnyc_methods
    │   │   ├── __init__.py
    │   │   ├── chunks.py
    │   │   ├── collections.py
    │   │   ├── conversations.py
    │   │   ├── documents.py
    │   │   ├── graphs.py
    │   │   ├── indices.py
    │   │   ├── prompts.py
    │   │   ├── retrieval.py
    │   │   ├── system.py
    │   │   └── users.py
    │   ├── async_client.py
    │   ├── base
    │   │   ├── __init_.py
    │   │   └── base_client.py
    │   ├── models.py
    │   ├── sync_client.py
    │   └── sync_methods
    │   │   ├── __init__.py
    │   │   ├── chunks.py
    │   │   ├── collections.py
    │   │   ├── conversations.py
    │   │   ├── documents.py
    │   │   ├── graphs.py
    │   │   ├── indices.py
    │   │   ├── prompts.py
    │   │   ├── retrieval.py
    │   │   ├── system.py
    │   │   └── users.py
    ├── shared
    │   ├── __init__.py
    │   ├── abstractions
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── document.py
    │   │   ├── exception.py
    │   │   ├── graph.py
    │   │   ├── llm.py
    │   │   ├── prompt.py
    │   │   ├── search.py
    │   │   ├── tool.py
    │   │   ├── user.py
    │   │   └── vector.py
    │   ├── api
    │   │   └── models
    │   │   │   ├── __init__.py
    │   │   │   ├── auth
    │   │   │       ├── __init__.py
    │   │   │       └── responses.py
    │   │   │   ├── base.py
    │   │   │   ├── graph
    │   │   │       ├── __init__.py
    │   │   │       └── responses.py
    │   │   │   ├── ingestion
    │   │   │       ├── __init__.py
    │   │   │       └── responses.py
    │   │   │   ├── management
    │   │   │       ├── __init__.py
    │   │   │       └── responses.py
    │   │   │   └── retrieval
    │   │   │       ├── __init__.py
    │   │   │       └── responses.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── base_utils.py
    │   │   └── splitter
    │   │       ├── __init__.py
    │   │       └── text.py
    ├── tests
    │   ├── integration
    │   │   ├── conftest.py
    │   │   ├── test_agent.py
    │   │   ├── test_base.py
    │   │   ├── test_chunks.py
    │   │   ├── test_collections.py
    │   │   ├── test_collections_users_interaction.py
    │   │   ├── test_conversations.py
    │   │   ├── test_documents.py
    │   │   ├── test_filters.py
    │   │   ├── test_graphs.py
    │   │   ├── test_indices.py
    │   │   ├── test_ingestion.py
    │   │   ├── test_retrieval.py
    │   │   ├── test_retrieval_advanced.py
    │   │   ├── test_system.py
    │   │   └── test_users.py
    │   ├── scaling
    │   │   ├── __init__.py
    │   │   └── loadTester.py
    │   └── unit
    │   │   ├── agent
    │   │       ├── test_agent.py
    │   │       ├── test_agent_citations.py
    │   │       ├── test_agent_citations_old.py
    │   │       ├── test_agent_old.py
    │   │       └── test_streaming_agent.py
    │   │   ├── app
    │   │       ├── test_config.py
    │   │       └── test_routes.py
    │   │   ├── conftest.py
    │   │   ├── database
    │   │       ├── test_collections.py
    │   │       ├── test_conversations.py
    │   │       ├── test_graphs.py
    │   │       └── test_limits.py
    │   │   ├── document
    │   │       ├── test_chunks.py
    │   │       ├── test_document_processing.py
    │   │       └── test_documents.py
    │   │   └── retrieval
    │   │       ├── __init__.py
    │   │       ├── conftest.py
    │   │       ├── test_citations.py
    │   │       ├── test_database_filters.py
    │   │       ├── test_rag_processing.py
    │   │       └── test_retrieval_old.py
    └── uv.lock
└── services
    ├── README.md
    ├── clustering
        ├── Dockerfile.clustering
        └── main.py
    └── unstructured
        ├── Dockerfile.unstructured
        ├── README.md
        └── main.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # Environment variables for LLM provider(s)
 2 | export OPENAI_API_KEY=sk-...
 3 | #  uncomment the following lines to enable other LLM providers
 4 | # export ANTHROPIC_API_KEY=...
 5 | # export VERTEX_API_KEY=...
 6 | # export XAI_API_KEY=...
 7 | # Add other provider keys as needed
 8 | 
 9 | # Environment variables for the Postgres database
10 | export R2R_POSTGRES_USER=your_user
11 | export R2R_POSTGRES_PASSWORD=your_password
12 | export R2R_POSTGRES_HOST=your_host
13 | export R2R_POSTGRES_PORT=your_port
14 | export R2R_POSTGRES_DBNAME=your_db
15 | export R2R_PROJECT_NAME=your_project_name
16 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.html linguist-documentation
2 | *.ipynb linguist-documentation
3 | templates/** linguist-vendored
4 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/actions/login-docker/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Login Docker'
 2 | description: 'Sets up Docker for running R2R'
 3 | inputs:
 4 |   docker_username:
 5 |     description: 'Docker Hub username'
 6 |     required: true
 7 |   docker_password:
 8 |     description: 'Docker Hub password or token'
 9 |     required: true
10 | runs:
11 |   using: "composite"
12 |   steps:
13 |     - name: Login to Docker Hub
14 |       uses: docker/login-action@v2
15 |       with:
16 |         username: ${{ inputs.docker_username }}
17 |         password: ${{ inputs.docker_password }}
18 | 


--------------------------------------------------------------------------------
/.github/actions/setup-docker/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Setup Docker'
 2 | description: 'Sets up Docker for running R2R'
 3 | runs:
 4 |   using: "composite"
 5 |   steps:
 6 |     - name: Set up Docker
 7 |       uses: docker-practice/actions-setup-docker@master
 8 |       with:
 9 |         docker_version: 20.10
10 |         docker_buildx: true
11 | 
12 |     - name: Set up Docker Buildx
13 |       uses: docker/setup-buildx-action@v2
14 | 


--------------------------------------------------------------------------------
/.github/actions/setup-python-full/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Setup Python for R2R Full'
 2 | description: 'Sets up Python and installs R2R dependencies for full installation'
 3 | 
 4 | inputs:
 5 |   os:
 6 |     description: 'Operating system'
 7 |     required: true
 8 |   python-version:
 9 |     description: 'Python version to use'
10 |     required: false
11 |     default: '3.12'
12 | 
13 | runs:
14 |   using: "composite"
15 |   steps:
16 |     - name: Set up Python
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: ${{ inputs.python-version }}
20 |         cache: 'pip'
21 | 
22 |     - name: Install R2R CLI & Python SDK
23 |       shell: bash
24 |       run: |
25 |         pip install r2r
26 | 
27 |     - name: Install uv
28 |       shell: bash
29 |       run: |
30 |         pip install uv
31 | 
32 |     - name: Install uv
33 |       shell: bash
34 |       run: |
35 |         pip install uv
36 | 
37 |     - name: Cache uv dependencies
38 |       uses: actions/cache@v4
39 |       with:
40 |         path: |
41 |           py/.venv
42 |           py/uv.lock
43 |         key: ${{ runner.os }}-uv-${{ hashFiles('py/pyproject.toml', 'py/uv.lock') }}
44 |         restore-keys: |
45 |           ${{ runner.os }}-uv-
46 | 
47 |     - name: Install dependencies with uv
48 |       shell: bash
49 |       working-directory: py
50 |       run: |
51 |         uv sync --extra core
52 | 


--------------------------------------------------------------------------------
/.github/actions/setup-python-light/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Setup Python for R2R Light'
 2 | description: 'Sets up Python environment and installs dependencies using uv'
 3 | 
 4 | inputs:
 5 |   os:
 6 |     description: 'Operating system'
 7 |     required: true
 8 |   python-version:
 9 |     description: 'Python version to use'
10 |     required: false
11 |     default: '3.12'
12 | 
13 | runs:
14 |   using: "composite"
15 |   steps:
16 |     - name: Set up Python environment
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: ${{ inputs.python-version }}
20 |         cache: 'pip'
21 | 
22 |     - name: Install uv
23 |       shell: bash
24 |       run: |
25 |         pip install uv
26 | 
27 |     - name: Cache uv dependencies
28 |       uses: actions/cache@v4
29 |       with:
30 |         path: |
31 |           py/.venv
32 |           py/uv.lock
33 |         key: ${{ runner.os }}-uv-${{ hashFiles('py/pyproject.toml', 'py/uv.lock') }}
34 |         restore-keys: |
35 |           ${{ runner.os }}-uv-
36 | 
37 |     - name: Install dependencies with uv
38 |       shell: bash
39 |       working-directory: py
40 |       run: |
41 |         uv sync --extra core
42 |         uv pip install pip wheel
43 | 


--------------------------------------------------------------------------------
/.github/actions/start-r2r-full/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Start R2R Server'
 2 | description: 'Starts the R2R server'
 3 | runs:
 4 |   using: "composite"
 5 |   steps:
 6 |   - name: Inspect Docker image manifests
 7 |     shell: bash
 8 |     run: |
 9 |       docker manifest inspect ragtoriches/prod:latest
10 | 
11 |   - name: Start R2R Server
12 |     shell: bash
13 |     run: |
14 |       cd py
15 |       docker build -t r2r/local .
16 |       export R2R_CONFIG_NAME=full_azure
17 |       export R2R_IMAGE=r2r/local
18 |       docker compose -f r2r/compose.full.yaml --project-name r2r-full up -d
19 |       uv run r2r serve --docker --full --config-name=full_azure --build --image=r2r-local
20 | 


--------------------------------------------------------------------------------
/.github/actions/start-r2r-light/action.yml:
--------------------------------------------------------------------------------
 1 | name: 'Start R2R Server'
 2 | description: 'Starts the R2R server'
 3 | inputs:
 4 |   config-name:
 5 |     description: 'The R2R configuration name to use'
 6 |     required: false
 7 |     default: 'r2r_azure_with_test_limits'
 8 | runs:
 9 |   using: "composite"
10 |   steps:
11 |     - name: Start R2R server
12 |       shell: bash
13 |       run: |
14 |         cd py
15 |         export R2R_CONFIG_NAME=${{ inputs.config-name }}
16 |         uv run python -m r2r.serve &
17 |         echo "Waiting for services to start..."
18 |         sleep 30
19 | 


--------------------------------------------------------------------------------
/.github/workflows/build-cluster-service-docker.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish Cluster Service Docker Image
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | env:
 7 |   REGISTRY_BASE: ragtoriches
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout Repository
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.12'
20 | 
21 |       - name: Install toml package
22 |         run: pip install toml
23 | 
24 |       - name: Determine version
25 |         id: version
26 |         run: |
27 |           echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/cluster-prod" >> $GITHUB_OUTPUT
28 | 
29 |       - name: Set up Docker Buildx
30 |         uses: docker/setup-buildx-action@v3
31 | 
32 |       - name: Docker Auth
33 |         uses: docker/login-action@v3
34 |         with:
35 |           username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
36 |           password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}
37 | 
38 |       - name: Build and push image
39 |         uses: docker/build-push-action@v5
40 |         with:
41 |           context: ./services/clustering
42 |           file: ./services/clustering/Dockerfile.clustering
43 |           platforms: linux/amd64,linux/arm64
44 |           push: true
45 |           tags: ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
46 |           provenance: false
47 |           sbom: false
48 | 
49 |       - name: Verify manifest
50 |         run: |
51 |           docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
52 | 


--------------------------------------------------------------------------------
/.github/workflows/build-unst-service-docker.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish Unstructured Service Docker Image
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | env:
 7 |   REGISTRY_BASE: ragtoriches
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout Repository
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.12'
20 | 
21 |       - name: Install toml package
22 |         run: pip install toml
23 | 
24 |       - name: Determine version
25 |         id: version
26 |         run: |
27 |           echo "REGISTRY_IMAGE=${{ env.REGISTRY_BASE }}/unst-prod" >> $GITHUB_OUTPUT
28 | 
29 |       - name: Set up Docker Buildx
30 |         uses: docker/setup-buildx-action@v3
31 | 
32 |       - name: Docker Auth
33 |         uses: docker/login-action@v3
34 |         with:
35 |           username: ${{ secrets.RAGTORICHES_DOCKER_UNAME }}
36 |           password: ${{ secrets.RAGTORICHES_DOCKER_TOKEN }}
37 | 
38 |       - name: Build and push image
39 |         uses: docker/build-push-action@v5
40 |         with:
41 |           context: ./services/unstructured
42 |           file: ./services/unstructured/Dockerfile.unstructured
43 |           platforms: linux/amd64,linux/arm64
44 |           push: true
45 |           tags: ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
46 |           provenance: false
47 |           sbom: false
48 | 
49 |       - name: Verify manifest
50 |         run: |
51 |           docker buildx imagetools inspect ${{ steps.version.outputs.REGISTRY_IMAGE }}:latest
52 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-npm.yml:
--------------------------------------------------------------------------------
 1 | name: Publish NPM Package
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 | 
 6 | jobs:
 7 |   publish:
 8 |     runs-on: ubuntu-latest
 9 |     defaults:
10 |       run:
11 |         working-directory: js/sdk
12 |     steps:
13 |       - uses: actions/checkout@v4
14 | 
15 |       - name: Set up Node.js
16 |         uses: actions/setup-node@v3
17 |         with:
18 |           node-version: '20'
19 |           registry-url: 'https://registry.npmjs.org'
20 | 
21 |       - name: Install pnpm
22 |         uses: pnpm/action-setup@v2
23 |         with:
24 |           version: 6.0.2
25 | 
26 |       - name: Install dependencies
27 |         run: pnpm install
28 | 
29 |       - name: Build
30 |         run: pnpm run build
31 | 
32 |       - name: Publish to npm
33 |         run: pnpm publish --no-git-checks
34 |         env:
35 |           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
36 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - dev
 7 |       - dev-minor
 8 |   workflow_dispatch:
 9 | 
10 | jobs:
11 |   publish:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout code
15 |         uses: actions/checkout@v4
16 | 
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v5
19 |         with:
20 |           python-version: '3.12'
21 | 
22 |       - name: Install tools
23 |         run: pip install twine tomlkit build
24 | 
25 |       - name: Bump version for dev branches (TestPyPI)
26 |         if: github.event_name == 'push'
27 |         run: |
28 |           cd py
29 |           old_version=$(python -c "import tomlkit; d=tomlkit.parse(open('pyproject.toml').read()); print(d['project']['version'])")
30 |           new_version="${old_version}a$(date +'%Y%m%d%H%M')"
31 |           python -c "import tomlkit; d=tomlkit.parse(open('pyproject.toml').read()); d['project']['version']='$new_version'; open('pyproject.toml','w').write(tomlkit.dumps(d))"
32 | 
33 |       - name: Build distributions
34 |         run: |
35 |           cd py
36 |           python -m build
37 | 
38 |       - name: Publish to TestPyPI
39 |         if: github.event_name == 'push'
40 |         env:
41 |           PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
42 |           TEST_PYPI_API_TOKEN: ${{ secrets.TEST_PYPI_API_TOKEN }}
43 |         run: |
44 |           cd py
45 |           twine upload --repository-url https://test.pypi.org/legacy/ -u __token__ -p "$TEST_PYPI_API_TOKEN" dist/*
46 | 
47 |       - name: Publish to PyPI
48 |         if: github.event_name == 'workflow_dispatch'
49 |         env:
50 |           PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
51 |           PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
52 |         run: |
53 |           cd py
54 |           twine upload -u __token__ -p "$PYPI_API_TOKEN" dist/*
55 | 


--------------------------------------------------------------------------------
/.github/workflows/quality.yml:
--------------------------------------------------------------------------------
 1 | name: Code Quality Checks
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ '**' ]
 6 |   pull_request:
 7 | 
 8 | jobs:
 9 |   pre-commit:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v3
13 | 
14 |       - name: Set up Python
15 |         uses: actions/setup-python@v4
16 |         with:
17 |           python-version: '3.x'
18 | 
19 |       - name: Install dependencies
20 |         run: |
21 |           python -m pip install --upgrade pip
22 |           pip install pre-commit
23 |           pip install mypy
24 |           pip install types-requests types-toml types-aiofiles
25 | 
26 |       - name: Run pre-commit hooks
27 |         run: |
28 |           pre-commit run --all-files
29 | 


--------------------------------------------------------------------------------
/.github/workflows/r2r-js-sdk-ci.yml:
--------------------------------------------------------------------------------
 1 | name: R2R JS SDK Integration CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |     paths:
 7 |       - 'js/sdk/**'
 8 |   pull_request:
 9 |     branches: [main]
10 |     paths:
11 |       - 'js/sdk/**'
12 | 
13 | jobs:
14 |   build-and-test:
15 |     runs-on: ubuntu-latest
16 | 
17 |     defaults:
18 |       run:
19 |         working-directory: ./js/sdk
20 | 
21 |     steps:
22 |       - uses: actions/checkout@v4
23 | 
24 |       - name: Use Node.js
25 |         uses: actions/setup-node@v4
26 |         with:
27 |           node-version: "18"
28 | 
29 |       - name: Install pnpm
30 |         uses: pnpm/action-setup@v4
31 |         with:
32 |           version: 8
33 | 
34 |       - name: Install dependencies
35 |         run: pnpm install
36 | 
37 |       - name: Build
38 |         run: pnpm run build
39 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | js/sdk/**/*.js
 2 | js/tests/*
 3 | **/.DS_Store
 4 | 
 5 | .env
 6 | .gitignore
 7 | *.log
 8 | .DS_Store
 9 | *.gguf
10 | logs/
11 | workspace/
12 | py/workspace/
13 | uploads/
14 | **/__pycache__
15 | **/.mypy_cache
16 | **/.pytest_cache
17 | dump/*
18 | .next
19 | node_modules
20 | .idea
21 | 
22 | coverage.xml
23 | .coverage
24 | 
25 | **/*.sqlite*
26 | **/*.sqlite3*
27 | 
28 | node_modules/
29 | dist/
30 | **/.data/*
31 | 
32 | *.exe
33 | *.exe~
34 | *.dll
35 | *.so
36 | *.dylib
37 | *.test
38 | go.work
39 | go.work.sum
40 | 
41 | .vscode/
42 | .python-version
43 | .ruff_cache/
44 | *.egg-info
45 | .venv
46 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.0.0
 4 |     hooks:
 5 |       - id: trailing-whitespace
 6 |         exclude: ^.venv/
 7 |       - id: end-of-file-fixer
 8 |         exclude: ^.venv/
 9 |       - id: check-added-large-files
10 |         exclude: ^.venv/
11 |       - id: check-ast
12 |         exclude: ^.venv/
13 |       - id: check-yaml
14 |         exclude: ^.venv/
15 | 
16 |   - repo: local
17 |     hooks:
18 |       - id: check-typing-imports
19 |         name: Check for Dict, List, or Union usage
20 |         entry: bash -c 'echo "Checking for typing imports..." && FOUND=$(cd "$(git rev-parse --show-toplevel)" && find . -path "*/py/*.py" | grep -v "venv" | grep -v "/.venv/" | grep -v "/site-packages/" | grep -v "test_" | grep -v "/migrations/" | xargs grep -l "from typing.*import.*[^d]Dict\\|from typing.*import.*List\\|from typing.*import.*Union" 2>/dev/null || echo "") && if [ -n "$FOUND" ]; then echo "$FOUND"; echo "  Please import dict instead of Dict, list instead of List, and the logical OR operator"; exit 1; else echo "No problematic imports found!"; exit 0; fi'
21 |         language: system
22 |         types: [python]
23 |         pass_filenames: false
24 | 
25 |   - repo: local
26 |     hooks:
27 |       - id: check-print-statements
28 |         name: Check for print statements
29 |         entry: bash -c 'echo "Checking for print statements..." && FOUND=$(cd "$(git rev-parse --show-toplevel)" && find . -path "*/py/*.py" | grep -v "venv" | grep -v "/.venv/" | grep -v "/site-packages/" | grep -v "test_" | grep -v "/core/examples/" | grep -v "/migrations/" | grep -v "/tests/" | grep -v "/examples.py" | xargs grep -l "print(" 2>/dev/null || echo "") && if [ -n "$FOUND" ]; then echo "$FOUND"; echo "Found print statements!"; exit 1; else echo "No print statements found!"; exit 0; fi'
30 |         language: system
31 |         types: [python]
32 |         pass_filenames: false
33 |         exclude: ^(.venv/|py/.venv/|py/core/examples/|py/migrations/|py/tests/)
34 | 
35 |   - repo: https://github.com/astral-sh/ruff-pre-commit
36 |     rev: v0.9.6
37 |     hooks:
38 |       - id: ruff
39 |         args: [--fix]
40 |         files: ^py/
41 |         exclude: ^(py/tests/|.venv/)
42 |       - id: ruff-format
43 |         files: ^py/
44 |         exclude: ^(py/tests/|.venv/)
45 | 
46 |   - repo: local
47 |     hooks:
48 |       - id: mypy
49 |         name: mypy
50 |         entry: bash -c 'cd "$(git rev-parse --show-toplevel)/py" && python -m mypy --exclude "migrations" --exclude "venv*" --exclude "test_*" .'
51 |         language: system
52 |         types: [python]
53 |         pass_filenames: false
54 |         exclude: ^(.venv/|migrations/)
55 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct Summary
 2 | 
 3 | TL;DR: Be nice. Be respectful. Be professional. Don't be a jerk.
 4 | 
 5 | ## Commitment
 6 | 
 7 | We strive for a harassment-free, inclusive, and healthy community experience for all, regardless of personal characteristics or background.
 8 | 
 9 | ## Expected Behaviors
10 | 
11 | - **Empathy and Kindness**: Show understanding and kindness to others.
12 | - **Respect**: Value different viewpoints and experiences.
13 | - **Constructive Feedback**: Offer and accept feedback graciously.
14 | - **Accountability**: Own up to mistakes and learn from them.
15 | - **Community Focus**: Prioritize what's best for the whole community.
16 | 
17 | ## Unacceptable Behaviors
18 | 
19 | - **Sexualized Content**: Avoid sexual language and unwelcome sexual attention.
20 | - **Disrespect**: No trolling, insults, or derogatory comments.
21 | - **Harassment**: Public or private harassment is unacceptable.
22 | - **Privacy Violations**: Do not share private information without consent.
23 | - **Inappropriate Conduct**: Behavior not suitable for a professional setting is not allowed.
24 | 
25 | ## Enforcement
26 | 
27 | - **Leaders' Responsibility**: Leaders clarify standards and take corrective actions.
28 | - **Scope**: Applies to all community spaces and when representing the community.
29 | - **Reporting**: Incidents can be reported to owen@sciphi.ai.
30 | 
31 | ## Enforcement Guidelines
32 | 
33 | - **Correction**: Private warning for unprofessional behavior.
34 | - **Warning**: Consequences for repeated violations.
35 | - **Temporary Ban**: For serious or sustained inappropriate behavior.
36 | - **Permanent Ban**: For egregious violations, including harassment.
37 | 
38 | ## Attribution
39 | 
40 | Adapted from the [Contributor Covenant version 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html), with Community Impact Guidelines inspired by [Mozilla's code of conduct enforcement ladder](https://www.mozilla.org/en-US/about/governance/policies/participation/).
41 | 
42 | For more details and FAQs, visit [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). Translations are available [here](https://www.contributor-covenant.org/translations).
43 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # R2R Contribution Guide
 2 | 
 3 | ## Quick Start
 4 | 
 5 | - **Pre-Discussion**: Feel free to propose your ideas via issues, [Discord](https://discord.gg/p6KqD2kjtB) if you want to get early feedback.
 6 | - **Code of Conduct**: Adhere to our [Code of Conduct](./CODE_OF_CONDUCT.md) in all interactions.
 7 | - **Pull Requests (PRs)**: Follow the PR process for contributions.
 8 | 
 9 | ## Pull Request Process
10 | 
11 | 1. **Dependencies**: Ensure all dependencies are necessary and documented.
12 | 2. **Documentation**: Update README.md with any changes to interfaces, including new environment variables, exposed ports, and other relevant details.
13 | 3. **Versioning**: Increment version numbers in examples and README.md following [SemVer](http://semver.org/).
14 | 4. **Review**: A PR can be merged after receiving approval from at least two other developers. If you lack merge permissions, request a review for merging.
15 | 
16 | ## Attribution
17 | 
18 | This Code of Conduct adapts from the [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4/).
19 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2024 EmergentAGI Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.md:
--------------------------------------------------------------------------------
1 | # The R2R Manifest
2 | 
3 | We will do our best to build useful AI tools for developers _(before AGI)_.
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ./py/README.md


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Security Policy
 3 | 
 4 | At R2R, we take the security of our project and its users seriously. We appreciate the contributions of security researchers and developers in helping us identify and address potential vulnerabilities.
 5 | 
 6 | ## Reporting a Vulnerability
 7 | 
 8 | If you discover a potential security vulnerability in R2R, please follow these steps to report it:
 9 | 
10 | 1. Create a new issue on the GitHub repository using the "Vulnerability Disclosure" issue template.
11 | 2. Set the issue as "confidential" if you are unsure whether the issue is a potential vulnerability or not. It is easier to make a confidential issue public than to remediate an issue that should have been confidential.
12 | 3. Label the issue with the `security` label at a minimum. Additional labels may be applied by the security team and other project maintainers to assist with the triage process.
13 | 4. Provide a detailed description of the vulnerability, including steps to reproduce, potential impact, and any other relevant information.
14 | 5. If the issue contains sensitive information or user-specific data, such as private repository contents, assign the `keep confidential` label to the issue. If possible, avoid including such information directly in the issue and instead provide links to resources that are only accessible to the project maintainers.
15 | 
16 | ## Vulnerability Handling Process
17 | 
18 | Once a vulnerability is reported, the R2R security team will follow these steps:
19 | 
20 | 1. Acknowledge receipt of the vulnerability report within 48 hours.
21 | 2. Assess the severity and impact of the vulnerability.
22 | 3. Develop a fix or mitigation plan for the vulnerability.
23 | 4. Notify the reporter about the progress and estimated timeline for the fix.
24 | 5. Once the fix is ready, release a new version of R2R that addresses the vulnerability.
25 | 6. Publicly disclose the vulnerability and the fix after a reasonable period to allow users to update their installations.
26 | 
27 | ## Scope
28 | 
29 | This security policy applies to the R2R codebase and its dependencies. It does not cover vulnerabilities in the underlying operating systems, hardware, or third-party libraries used by R2R.
30 | 
31 | ## Recognition
32 | 
33 | We greatly appreciate the efforts of security researchers and developers who responsibly disclose vulnerabilities to us. With your permission, we will acknowledge your contribution in the release notes and any public disclosures related to the vulnerability.
34 | 
35 | ## Contact
36 | 
37 | If you have any questions or concerns regarding the security of R2R, please contact the project maintainers at [security@r2r.com](mailto:security@r2r.com).
38 | 
39 | Thank you for helping us keep R2R and its users secure!
40 | 


--------------------------------------------------------------------------------
/docker/compose.yaml:
--------------------------------------------------------------------------------
 1 | volumes:
 2 |   postgres_data:
 3 |     name: postgres_data
 4 |   minio_data:
 5 |     name: minio_data
 6 | 
 7 | services:
 8 |   postgres:
 9 |     image: pgvector/pgvector:pg16
10 |     profiles: [postgres]
11 |     env_file:
12 |       - ./env/postgres.env
13 |     volumes:
14 |       - postgres_data:/var/lib/postgresql/data
15 |     ports:
16 |       - "5432:5432"
17 |     healthcheck:
18 |       test: ["CMD-SHELL", "pg_isready -U postgres"]
19 |       interval: 10s
20 |       timeout: 5s
21 |       retries: 5
22 |     restart: on-failure
23 |     command: >
24 |       postgres
25 |       -c max_connections=1024
26 | 
27 |   minio:
28 |     image: minio/minio
29 |     profiles: [minio]
30 |     env_file:
31 |       - ./env/minio.env
32 |     volumes:
33 |       - minio_data:/data
34 |     ports:
35 |       - "9000:9000"
36 |       - "9001:9001"
37 |     healthcheck:
38 |       test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
39 |       interval: 10s
40 |       timeout: 5s
41 |       retries: 5
42 |     restart: on-failure
43 |     command: server /data --console-address ":9001"
44 | 
45 |   graph_clustering:
46 |     image: ragtoriches/cluster-prod
47 |     ports:
48 |       - "7276:7276"
49 |     healthcheck:
50 |       test: ["CMD", "curl", "-f", "http://localhost:7276/health"]
51 |       interval: 10s
52 |       timeout: 5s
53 |       retries: 5
54 | 
55 |   r2r:
56 |     image: sciphiai/r2r:latest
57 |     ports:
58 |       - "7272:7272"
59 |     env_file:
60 |       - ./env/r2r.env
61 |     healthcheck:
62 |       test: ["CMD", "curl", "-f", "http://localhost:7272/v3/health"]
63 |       interval: 6s
64 |       timeout: 5s
65 |       retries: 5
66 |     restart: on-failure
67 |     volumes:
68 |       - ./user_configs:/app/user_configs
69 |       - ./user_tools:/app/user_tools
70 |     extra_hosts:
71 |       - host.docker.internal:host-gateway
72 | 
73 |   r2r-dashboard:
74 |     image: sciphiai/r2r-dashboard:1.0.3
75 |     env_file:
76 |       - ./env/r2r-dashboard.env
77 |     ports:
78 |       - "7273:3000"
79 | 


--------------------------------------------------------------------------------
/docker/env/hatchet.env:
--------------------------------------------------------------------------------
 1 | DATABASE_URL="postgres://hatchet_user:hatchet_password@hatchet-postgres:5432/hatchet?sslmode=disable"
 2 | 
 3 | HATCHET_CLIENT_GRPC_MAX_RECV_MESSAGE_LENGTH=134217728
 4 | HATCHET_CLIENT_GRPC_MAX_SEND_MESSAGE_LENGTH=134217728
 5 | 
 6 | DATABASE_POSTGRES_PORT=5432
 7 | DATABASE_POSTGRES_HOST=hatchet-postgres
 8 | DATABASE_POSTGRES_USERNAME=hatchet_user
 9 | DATABASE_POSTGRES_PASSWORD=hatchet_password
10 | HATCHET_DATABASE_POSTGRES_DB_NAME=hatchet
11 | POSTGRES_DB=hatchet
12 | POSTGRES_USER=hatchet_user
13 | POSTGRES_PASSWORD=hatchet_password
14 | 
15 | SERVER_TASKQUEUE_RABBITMQ_URL=amqp://user:password@hatchet-rabbitmq:5672/
16 | SERVER_AUTH_COOKIE_DOMAIN=http://host.docker.internal:7274
17 | SERVER_URL=http://host.docker.internal:7274
18 | SERVER_AUTH_COOKIE_INSECURE=t
19 | SERVER_GRPC_BIND_ADDRESS=0.0.0.0
20 | SERVER_GRPC_INSECURE=t
21 | SERVER_GRPC_BROADCAST_ADDRESS=hatchet-engine:7077
22 | SERVER_GRPC_MAX_MSG_SIZE=134217728
23 | SERVER_GRPC_PORT="7077"
24 | 
25 | RABBITMQ_DEFAULT_USER=user
26 | RABBITMQ_DEFAULT_PASS=password
27 | 


--------------------------------------------------------------------------------
/docker/env/minio.env:
--------------------------------------------------------------------------------
1 | MINIO_ROOT_USER=minioadmin
2 | MINIO_ROOT_PASSWORD=minioadmin
3 | 


--------------------------------------------------------------------------------
/docker/env/postgres.env:
--------------------------------------------------------------------------------
1 | POSTGRES_USER=postgres
2 | POSTGRES_PASSWORD=postgres
3 | POSTGRES_HOST=postgres
4 | POSTGRES_PORT=5432
5 | POSTGRES_MAX_CONNECTIONS=1024
6 | PGPORT=5432
7 | 


--------------------------------------------------------------------------------
/docker/env/r2r-dashboard.env:
--------------------------------------------------------------------------------
1 | NEXT_PUBLIC_R2R_DEPLOYMENT_URL=http://localhost:7272
2 | NEXT_PUBLIC_HATCHET_DASHBOARD_URL=http://localhost:7274
3 | NEXT_PUBLIC_R2R_DEFAULT_EMAIL="admin@example.com"
4 | NEXT_PUBLIC_R2R_DEFAULT_PASSWORD="change_me_immediately"
5 | 


--------------------------------------------------------------------------------
/docker/env/r2r-full.env:
--------------------------------------------------------------------------------
  1 | # R2R
  2 | R2R_PORT=7272
  3 | R2R_HOST=0.0.0.0
  4 | R2R_LOG_LEVEL=INFO
  5 | R2R_CONFIG_NAME=full
  6 | R2R_CONFIG_PATH=
  7 | R2R_PROJECT_NAME=r2r_default
  8 | R2R_SECRET_KEY=
  9 | R2R_USER_TOOLS_PATH=/app/user_tools
 10 | R2R_LOG_FORMAT=
 11 | 
 12 | # Postgres Configuration
 13 | R2R_POSTGRES_USER=postgres
 14 | R2R_POSTGRES_PASSWORD=postgres
 15 | R2R_POSTGRES_HOST=postgres
 16 | R2R_POSTGRES_PORT=5432
 17 | R2R_POSTGRES_DBNAME=postgres
 18 | R2R_POSTGRES_MAX_CONNECTIONS=1024
 19 | R2R_POSTGRES_STATEMENT_CACHE_SIZE=100
 20 | 
 21 | # Hatchet
 22 | HATCHET_CLIENT_TLS_STRATEGY=none
 23 | 
 24 | # OpenAI
 25 | OPENAI_API_KEY=
 26 | OPENAI_API_BASE=
 27 | 
 28 | # Azure Foundry
 29 | AZURE_FOUNDRY_API_ENDPOINT=
 30 | AZURE_FOUNDRY_API_KEY=
 31 | 
 32 | # XAI / GROK
 33 | XAI_API_KEY=
 34 | 
 35 | # Anthropic
 36 | ANTHROPIC_API_KEY=
 37 | 
 38 | # Azure
 39 | AZURE_API_KEY=
 40 | AZURE_API_BASE=
 41 | AZURE_API_VERSION=
 42 | 
 43 | # Google Vertex AI
 44 | GOOGLE_APPLICATION_CREDENTIALS=
 45 | VERTEX_PROJECT=
 46 | VERTEX_LOCATION=
 47 | 
 48 | # Google Gemini
 49 | GEMINI_API_KEY=
 50 | 
 51 | # Mistral
 52 | MISTRAL_API_KEY=
 53 | 
 54 | # AWS Bedrock
 55 | AWS_ACCESS_KEY_ID=
 56 | AWS_SECRET_ACCESS_KEY=
 57 | AWS_REGION_NAME=
 58 | 
 59 | # Groq
 60 | GROQ_API_KEY=
 61 | 
 62 | # Cohere
 63 | COHERE_API_KEY=
 64 | 
 65 | # Anyscale
 66 | ANYSCALE_API_KEY=
 67 | 
 68 | # Ollama
 69 | OLLAMA_API_BASE=http://host.docker.internal:11434
 70 | 
 71 | # LM Studio
 72 | LM_STUDIO_API_BASE=http://host.docker.internal:1234
 73 | LM_STUDIO_API_KEY=1234
 74 | 
 75 | # Huggingface
 76 | HUGGINGFACE_API_BASE=http://host.docker.internal:8080
 77 | HUGGINGFACE_API_KEY=
 78 | 
 79 | # Unstructured
 80 | UNSTRUCTURED_API_KEY=
 81 | UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general
 82 | UNSTRUCTURED_SERVICE_URL=http://unstructured:7275
 83 | UNSTRUCTURED_NUM_WORKERS=10
 84 | 
 85 | # Graphologic
 86 | CLUSTERING_SERVICE_URL=http://graph_clustering:7276
 87 | 
 88 | # OAuth Credentials
 89 | GOOGLE_CLIENT_ID=
 90 | GOOGLE_CLIENT_SECRET=
 91 | GOOGLE_REDIRECT_URI=
 92 | 
 93 | GITHUB_CLIENT_ID=
 94 | GITHUB_CLIENT_SECRET=
 95 | GITHUB_REDIRECT_URI=
 96 | 
 97 | # Email
 98 | MAILERSEND_API_KEY=
 99 | SENDGRID_API_KEY=
100 | 
101 | # Websearch
102 | FIRECRAWL_API_KEY=
103 | SERPER_API_KEY=
104 | TAVILY_API_KEY=
105 | 
106 | # Sentry Tracing
107 | R2R_SENTRY_DSN=
108 | R2R_SENTRY_ENVIRONMENT=
109 | R2R_SENTRY_TRACES_SAMPLE_RATE=
110 | R2R_SENTRY_PROFILES_SAMPLE_RATE=
111 | 


--------------------------------------------------------------------------------
/docker/env/r2r.env:
--------------------------------------------------------------------------------
  1 | # R2R
  2 | R2R_PORT=7272
  3 | R2R_HOST=0.0.0.0
  4 | R2R_LOG_LEVEL=INFO
  5 | R2R_CONFIG_NAME=
  6 | R2R_CONFIG_PATH=
  7 | R2R_PROJECT_NAME=r2r_default
  8 | R2R_SECRET_KEY=
  9 | R2R_USER_TOOLS_PATH=/app/user_tools
 10 | R2R_LOG_FORMAT=
 11 | 
 12 | # Postgres Configuration
 13 | R2R_POSTGRES_USER=postgres
 14 | R2R_POSTGRES_PASSWORD=postgres
 15 | R2R_POSTGRES_HOST=postgres
 16 | R2R_POSTGRES_PORT=5432
 17 | R2R_POSTGRES_DBNAME=postgres
 18 | R2R_POSTGRES_MAX_CONNECTIONS=1024
 19 | R2R_POSTGRES_STATEMENT_CACHE_SIZE=100
 20 | 
 21 | # Hatchet
 22 | HATCHET_CLIENT_TLS_STRATEGY=none
 23 | 
 24 | # OpenAI
 25 | OPENAI_API_KEY=
 26 | OPENAI_API_BASE=
 27 | 
 28 | # Azure Foundry
 29 | AZURE_FOUNDRY_API_ENDPOINT=
 30 | AZURE_FOUNDRY_API_KEY=
 31 | 
 32 | # XAI / GROK
 33 | XAI_API_KEY=
 34 | 
 35 | # Anthropic
 36 | ANTHROPIC_API_KEY=
 37 | 
 38 | # Azure
 39 | AZURE_API_KEY=
 40 | AZURE_API_BASE=
 41 | AZURE_API_VERSION=
 42 | 
 43 | # Google Vertex AI
 44 | GOOGLE_APPLICATION_CREDENTIALS=
 45 | VERTEX_PROJECT=
 46 | VERTEX_LOCATION=
 47 | 
 48 | # Google Gemini
 49 | GEMINI_API_KEY=
 50 | 
 51 | # Mistral
 52 | MISTRAL_API_KEY=
 53 | 
 54 | # AWS Bedrock
 55 | AWS_ACCESS_KEY_ID=
 56 | AWS_SECRET_ACCESS_KEY=
 57 | AWS_REGION_NAME=
 58 | 
 59 | # Groq
 60 | GROQ_API_KEY=
 61 | 
 62 | # Cohere
 63 | COHERE_API_KEY=
 64 | 
 65 | # Anyscale
 66 | ANYSCALE_API_KEY=
 67 | 
 68 | # Ollama
 69 | OLLAMA_API_BASE=http://host.docker.internal:11434
 70 | 
 71 | # LM Studio
 72 | LM_STUDIO_API_BASE=http://host.docker.internal:1234
 73 | LM_STUDIO_API_KEY=1234
 74 | 
 75 | # Huggingface
 76 | HUGGINGFACE_API_BASE=http://host.docker.internal:8080
 77 | HUGGINGFACE_API_KEY=
 78 | 
 79 | # Unstructured
 80 | UNSTRUCTURED_API_KEY=
 81 | UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general
 82 | UNSTRUCTURED_SERVICE_URL=http://unstructured:7275
 83 | UNSTRUCTURED_NUM_WORKERS=10
 84 | 
 85 | # Graphologic
 86 | CLUSTERING_SERVICE_URL=http://graph_clustering:7276
 87 | 
 88 | # OAuth Credentials
 89 | GOOGLE_CLIENT_ID=
 90 | GOOGLE_CLIENT_SECRET=
 91 | GOOGLE_REDIRECT_URI=
 92 | 
 93 | GITHUB_CLIENT_ID=
 94 | GITHUB_CLIENT_SECRET=
 95 | GITHUB_REDIRECT_URI=
 96 | 
 97 | # Email
 98 | MAILERSEND_API_KEY=
 99 | SENDGRID_API_KEY=
100 | 
101 | # Websearch
102 | FIRECRAWL_API_KEY=
103 | SERPER_API_KEY=
104 | TAVILY_API_KEY=
105 | 
106 | # Sentry Tracing
107 | R2R_SENTRY_DSN=
108 | R2R_SENTRY_ENVIRONMENT=
109 | R2R_SENTRY_TRACES_SAMPLE_RATE=
110 | R2R_SENTRY_PROFILES_SAMPLE_RATE=
111 | 


--------------------------------------------------------------------------------
/docker/fluent-bit/fluent-bit.conf:
--------------------------------------------------------------------------------
 1 | [SERVICE]
 2 |     Flush        1
 3 |     Daemon       Off
 4 |     Log_Level    info
 5 |     Parsers_File parsers.conf
 6 | 
 7 | [INPUT]
 8 |     Tag    backend
 9 |     Name   forward
10 |     Listen 0.0.0.0
11 |     Port   24224
12 | 
13 | [FILTER]
14 |     Match    backend
15 |     Name     parser
16 |     Key_Name log
17 |     Parser   json
18 | 
19 | [OUTPUT]
20 |     Match            backend
21 |     Name             http
22 |     host             host.docker.internal
23 |     port             9428
24 |     uri              /insert/jsonline?_stream_fields=log&_msg_field=msg,message&_time_field=date
25 |     format           json_lines
26 |     json_date_format iso8601
27 | 


--------------------------------------------------------------------------------
/docker/fluent-bit/parsers.conf:
--------------------------------------------------------------------------------
1 | [PARSER]
2 |     Name   json
3 |     Format json
4 | 


--------------------------------------------------------------------------------
/docker/scripts/create-hatchet-db.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | echo 'Waiting for PostgreSQL to be ready...'
 5 | while ! pg_isready -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user}; do
 6 |   sleep 1
 7 | done
 8 | 
 9 | echo 'PostgreSQL is ready, checking if database exists...'
10 | if ! PGPASSWORD=${HATCHET_POSTGRES_PASSWORD:-hatchet_password} psql -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user} -lqt | grep -qw ${HATCHET_POSTGRES_DBNAME:-hatchet}; then
11 |   echo 'Database does not exist, creating it...'
12 |   PGPASSWORD=${HATCHET_POSTGRES_PASSWORD:-hatchet_password} createdb -h hatchet-postgres -p 5432 -U ${HATCHET_POSTGRES_USER:-hatchet_user} -w ${HATCHET_POSTGRES_DBNAME:-hatchet}
13 | else
14 |   echo 'Database already exists, skipping creation.'
15 | fi
16 | 


--------------------------------------------------------------------------------
/docker/scripts/setup-token.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | echo 'Starting token creation process...'
 5 | 
 6 | # Attempt to create token and capture both stdout and stderr
 7 | TOKEN_OUTPUT=$(/hatchet/hatchet-admin token create --config /hatchet/config --tenant-id 707d0855-80ab-4e1f-a156-f1c4546cbf52 2>&1)
 8 | 
 9 | # Extract the token (assuming it's the only part that looks like a JWT)
10 | TOKEN=$(echo "$TOKEN_OUTPUT" | grep -Eo 'eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*')
11 | 
12 | if [ -z "$TOKEN" ]; then
13 |     echo 'Error: Failed to extract token. Full command output:' >&2
14 |     echo "$TOKEN_OUTPUT" >&2
15 |     exit 1
16 | fi
17 | 
18 | echo "$TOKEN" > /tmp/hatchet_api_key
19 | echo 'Token created and saved to /tmp/hatchet_api_key'
20 | 
21 | # Copy token to final destination
22 | echo -n "$TOKEN" > /hatchet_api_key/api_key.txt
23 | echo 'Token copied to /hatchet_api_key/api_key.txt'
24 | 
25 | # Verify token was copied correctly
26 | if [ "$(cat /tmp/hatchet_api_key)" != "$(cat /hatchet_api_key/api_key.txt)" ]; then
27 |     echo 'Error: Token copy failed, files do not match' >&2
28 |     echo 'Content of /tmp/hatchet_api_key:'
29 |     cat /tmp/hatchet_api_key
30 |     echo 'Content of /hatchet_api_key/api_key.txt:'
31 |     cat /hatchet_api_key/api_key.txt
32 |     exit 1
33 | fi
34 | 
35 | echo 'Hatchet API key has been saved successfully'
36 | echo 'Token length:' ${#TOKEN}
37 | echo 'Token (first 20 chars):' ${TOKEN:0:20}
38 | echo 'Token structure:' $(echo $TOKEN | awk -F. '{print NF-1}') 'parts'
39 | 
40 | # Check each part of the token
41 | for i in 1 2 3; do
42 |     PART=$(echo $TOKEN | cut -d. -f$i)
43 |     echo 'Part' $i 'length:' ${#PART}
44 |     echo 'Part' $i 'base64 check:' $(echo $PART | base64 -d >/dev/null 2>&1 && echo 'Valid' || echo 'Invalid')
45 | done
46 | 
47 | # Final validation attempt
48 | if ! echo $TOKEN | awk -F. '{print $2}' | base64 -d 2>/dev/null | jq . >/dev/null 2>&1; then
49 |     echo 'Warning: Token payload is not valid JSON when base64 decoded' >&2
50 | else
51 |     echo 'Token payload appears to be valid JSON'
52 | fi
53 | 


--------------------------------------------------------------------------------
/docker/scripts/start-r2r.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Check if HATCHET_CLIENT_TOKEN is set, if not read it from the API key file
 4 | if [ -z "${HATCHET_CLIENT_TOKEN}" ]; then
 5 |   export HATCHET_CLIENT_TOKEN=$(cat /hatchet_api_key/api_key.txt)
 6 | fi
 7 | 
 8 | # Start the application
 9 | exec uvicorn core.main.app_entry:app --host ${R2R_HOST} --port ${R2R_PORT}
10 | 


--------------------------------------------------------------------------------
/docker/user_configs/README.md:
--------------------------------------------------------------------------------
 1 | # User Configs Directory
 2 | 
 3 | ## Overview
 4 | This directory is mounted inside the R2R Docker container and is intended for custom configuration files. Any files placed here will be accessible to the application running in the container.
 5 | 
 6 | ## Usage
 7 | 1. Place your custom configuration files in this directory.
 8 | 2. Set the `R2R_CONFIG_PATH` in the `r2r.env` or `r2r-full.env` files.
 9 | 3. The path format inside the container is: `/app/user_configs/<config>.toml`
10 | 
11 | ## Configuration
12 | The application uses the environment variable you set to locate your configuration file:
13 | ```
14 | R2R_CONFIG_PATH=/app/user_configs/<config>.toml
15 | ```
16 | 
17 | If you want to use a different filename, update the `R2R_CONFIG_PATH` variable in your environment file to point to your custom file, for example:
18 | ```
19 | R2R_CONFIG_PATH=/app/user_configs/my_custom_config.toml
20 | ```
21 | 
22 | ## Troubleshooting
23 | If you encounter configuration errors, check:
24 | 1. Your configuration file exists in this directory
25 | 2. The filename matches what's specified in `R2R_CONFIG_PATH`
26 | 3. The file has proper permissions (readable)
27 | 4. The file contains valid TOML syntax
28 | 
29 | For more detailed configuration information, see the main documentation.
30 | 


--------------------------------------------------------------------------------
/docker/user_tools/README.md:
--------------------------------------------------------------------------------
 1 | # User-Defined Tools Directory
 2 | 
 3 | ## Overview
 4 | This directory is mounted inside the R2R Docker container and is intended for custom tool files. Any files placed here will be accessible to the application running in the container.
 5 | 
 6 | ## Usage
 7 | 1. Place your custom tool definitions in this directory. Utilize the template structure demonstrated here.
 8 | 2. Add any additional dependencies that you may need to the user_requirements.txt file in this directory.
 9 | 3. Include the tool in your agent configuration.
10 | 
11 | ## Creating a tool
12 | ```python
13 | from core.base.agent.tools.base import Tool
14 | 
15 | 
16 | class ToolNameTool(Tool):
17 |     """
18 |     A user defined tool.
19 |     """
20 | 
21 |     def __init__(self):
22 |         super().__init__(
23 |             name="tool_name",
24 |             description="A natural language tool description that is shown to the agent.",
25 |             parameters={
26 |                 "type": "object",
27 |                 "properties": {
28 |                     "input_parameter": {
29 |                         "type": "string",
30 |                         "description": "Define any input parameters by their name and type",
31 |                     },
32 |                 },
33 |                 "required": ["input_parameter"],
34 |             },
35 |             results_function=self.execute,
36 |             llm_format_function=None,
37 |         )
38 | 
39 |     async def execute(self, input_parameter: str, *args, **kwargs):
40 |         """
41 |         Implementation of the tool.
42 |         """
43 | 
44 |         # Any custom tool logic can go here
45 | 
46 |         output_response = some_method(input_parameter)
47 | 
48 |         result = AggregateSearchResult(
49 |             generic_tool_result=[web_response],
50 |         )
51 | 
52 |         # Add to results collector if context is provided
53 |         if context and hasattr(context, "search_results_collector"):
54 |             context.search_results_collector.add_aggregate_result(result)
55 | 
56 |         return result
57 | ```
58 | 
59 | ## Troubleshooting
60 | 
61 | For more detailed configuration information, see the main documentation.
62 | 


--------------------------------------------------------------------------------
/docker/user_tools/user_requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/docker/user_tools/user_requirements.txt


--------------------------------------------------------------------------------
/js/README.md:
--------------------------------------------------------------------------------
 1 | # R2R JavaScript SDK Documentation
 2 | 
 3 | For the complete look at the R2R JavaScript SDK, [visit our documentation.](https://r2r-docs.sciphi.ai/api-and-sdks/introduction)
 4 | 
 5 | ## Installation
 6 | 
 7 | Before starting, make sure you have completed the [R2R installation](https://r2r-docs.sciphi.ai/documentation/installation/overview).
 8 | 
 9 | Install the R2R JavaScript SDK:
10 | 
11 | ```bash
12 | npm install r2r-js
13 | ```
14 | 
15 | ## Getting Started
16 | 
17 | 1. Import the R2R client:
18 | 
19 | ```javascript
20 | const { r2rClient } = require('r2r-js');
21 | ```
22 | 
23 | 2. Initialize the client:
24 | 
25 | ```javascript
26 | const client = new r2rClient('http://localhost:7272');
27 | ```
28 | 
29 | 3. Check if R2R is running correctly:
30 | 
31 | ```javascript
32 | const healthResponse = await client.health();
33 | // {"status":"ok"}
34 | ```
35 | 
36 | 4. Login (Optional):
37 | ```javascript
38 | // client.register("me@email.com", "my_password"),
39 | // client.verify_email("me@email.com", "my_verification_code")
40 | client.login("me@email.com", "my_password")
41 | ```
42 | When using authentication the commands below automatically restrict the scope to a user's available documents.
43 | 


--------------------------------------------------------------------------------
/js/sdk/.prettierignore:
--------------------------------------------------------------------------------
1 | examples/
2 | 


--------------------------------------------------------------------------------
/js/sdk/__tests__/PromptsIntegrationSuperUser.test.ts:
--------------------------------------------------------------------------------
 1 | import { r2rClient } from "../src/index";
 2 | import { describe, test, beforeAll, expect } from "@jest/globals";
 3 | 
 4 | const baseUrl = "http://localhost:7272";
 5 | 
 6 | describe("r2rClient V3 Collections Integration Tests", () => {
 7 |   let client: r2rClient;
 8 | 
 9 |   beforeAll(async () => {
10 |     client = new r2rClient(baseUrl);
11 |     await client.users.login({
12 |       email: "admin@example.com",
13 |       password: "change_me_immediately",
14 |     });
15 |   });
16 | 
17 |   test("List prompts", async () => {
18 |     const response = await client.prompts.list();
19 |     expect(response.results).toBeDefined();
20 |   });
21 | 
22 |   test("Create a prompt", async () => {
23 |     const response = await client.prompts.create({
24 |       name: "test-prompt",
25 |       template: "Hello, {name}!",
26 |       inputTypes: { name: "string" },
27 |     });
28 |     expect(response.results).toBeDefined();
29 |   });
30 | 
31 |   test("Retrieve a prompt", async () => {
32 |     const response = await client.prompts.retrieve({
33 |       name: "test-prompt",
34 |     });
35 |     expect(response.results).toBeDefined();
36 |   });
37 | 
38 |   test("Update a prompt", async () => {
39 |     const response = await client.prompts.update({
40 |       name: "test-prompt",
41 |       template: "Hello, {name}! How are you?",
42 |       inputTypes: { name: "string" },
43 |     });
44 |     expect(response.results).toBeDefined();
45 |   });
46 | 
47 |   test("Delete a prompt", async () => {
48 |     const response = await client.prompts.delete({
49 |       name: "test-prompt",
50 |     });
51 |     expect(response.results).toBeDefined();
52 |   });
53 | });
54 | 


--------------------------------------------------------------------------------
/js/sdk/__tests__/SystemIntegrationSuperUser.test.ts:
--------------------------------------------------------------------------------
 1 | import { r2rClient } from "../src/index";
 2 | import { describe, test, beforeAll, expect } from "@jest/globals";
 3 | 
 4 | const baseUrl = "http://localhost:7272";
 5 | 
 6 | describe("r2rClient V3 Collections Integration Tests", () => {
 7 |   let client: r2rClient;
 8 | 
 9 |   beforeAll(async () => {
10 |     client = new r2rClient(baseUrl);
11 |     await client.users.login({
12 |       email: "admin@example.com",
13 |       password: "change_me_immediately",
14 |     });
15 |   });
16 | 
17 |   test("Get the health of the system", async () => {
18 |     const response = await client.system.health();
19 |     expect(response.results).toBeDefined();
20 |   });
21 | 
22 |   test("Get the settings of the system", async () => {
23 |     const response = await client.system.settings();
24 |     expect(response.results).toBeDefined();
25 |   });
26 | 
27 |   test("Get the status of the system", async () => {
28 |     const response = await client.system.status();
29 |     expect(response.results).toBeDefined();
30 |   });
31 | });
32 | 


--------------------------------------------------------------------------------
/js/sdk/__tests__/SystemIntegrationUser.test.ts:
--------------------------------------------------------------------------------
 1 | import { r2rClient } from "../src/index";
 2 | import { describe, test, beforeAll, expect } from "@jest/globals";
 3 | 
 4 | const baseUrl = "http://localhost:7272";
 5 | 
 6 | describe("r2rClient V3 System Integration Tests User", () => {
 7 |   let client: r2rClient;
 8 |   let userId: string;
 9 |   let name: string | undefined;
10 | 
11 |   beforeAll(async () => {
12 |     client = new r2rClient(baseUrl);
13 |   });
14 | 
15 |   test("Register a new user", async () => {
16 |     const response = await client.users.create({
17 |       email: "system_integration_test_user@example.com",
18 |       password: "change_me_immediately",
19 |       name: "Test User",
20 |       bio: "This is the bio of the test user.",
21 |     });
22 | 
23 |     userId = response.results.id;
24 |     name = response.results.name;
25 |     expect(response.results).toBeDefined();
26 |     expect(response.results.isSuperuser).toBe(false);
27 |     expect(response.results.name).toBe("Test User");
28 |     expect(response.results.bio).toBe("This is the bio of the test user.");
29 |   });
30 | 
31 |   test("Login as a user", async () => {
32 |     const response = await client.users.login({
33 |       email: "system_integration_test_user@example.com",
34 |       password: "change_me_immediately",
35 |     });
36 |     expect(response.results).toBeDefined();
37 |   });
38 | 
39 |   test("Get the health of the system", async () => {
40 |     const response = await client.system.health();
41 |     expect(response.results).toBeDefined();
42 |   });
43 | 
44 |   test("Only a superuser can call the `system/settings` endpoint.", async () => {
45 |     await expect(client.system.settings()).rejects.toThrow(/Status 403/);
46 |   });
47 | 
48 |   test("Only an authorized user can call the `system/status` endpoint.", async () => {
49 |     await expect(client.system.status()).rejects.toThrow(/Status 403/);
50 |   });
51 | 
52 |   test("Delete a user", async () => {
53 |     const response = await client.users.delete({
54 |       id: userId,
55 |       password: "change_me_immediately",
56 |     });
57 |     expect(response.results).toBeDefined();
58 |   });
59 | });
60 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/folder/karamozov.txt:
--------------------------------------------------------------------------------
1 | Alexius Fyodorovich Karamazov erat tertius filius Fyodoris Pavlovich Karamazov
2 | possessoris terrarum in nostro districtu bene noti sua aetate, et adhuc apud nos
3 | memoriae mandati ob mortem tragicam et obscuram, quae tredecim annos abhinc
4 | accidit, quamque suo loco describam.
5 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/folder/myshkin.txt:
--------------------------------------------------------------------------------
1 | Sub finem Novembris, tempore liquationis, hora nona mane, tramen in via
2 | ferrea Varsaviae et Petropoli plenis velocitatibus Petropolim
3 | appropinquabat. Dies ita humidus et nebulosus erat ut magno cum labore
4 | viatores invicem videre possent.
5 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/invalid.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "John Doe"
 3 |   "age": 30,
 4 |   'address': '123 Main St',
 5 |   "phone_numbers": [
 6 |     "555-0123",
 7 |     "555-4567",
 8 |   ],
 9 |   "is_active": True,
10 |   "details": {
11 |     "occupation": "developer"
12 |     "skills": ["python", "javascript"]
13 |   }
14 |   "notes": "Some text with "nested" quotes"
15 | }
16 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/marmeladov.txt:
--------------------------------------------------------------------------------
 1 | His conversation seemed to excite a general though languid interest. The
 2 | boys at the counter fell to sniggering. The innkeeper came down from the
 3 | upper room, apparently on purpose to listen to the “funny fellow”
 4 |  and sat down at a little distance, yawning lazily, but with dignity.
 5 | Evidently Marmeladov was a familiar figure here, and he had most
 6 | likely acquired his weakness for high-flown speeches from the habit of
 7 | frequently entering into conversation with strangers of all sorts in
 8 | the tavern. This habit develops into a necessity in some drunkards, and
 9 | especially in those who are looked after sharply and kept in order
10 | at home. Hence in the company of other drinkers they try to justify
11 | themselves and even if possible obtain consideration.
12 | 
13 | “Funny fellow!” pronounced the innkeeper. “And why don’t you work, why
14 | aren’t you at your duty, if you are in the service?”
15 | 
16 | “Why am I not at my duty, honoured sir,” Marmeladov went on, addressing
17 | himself exclusively to Raskolnikov, as though it had been he who put
18 | that question to him. “Why am I not at my duty? Does not my heart ache
19 | to think what a useless worm I am? A month ago when Mr. Lebeziatnikov
20 | beat my wife with his own hands, and I lay drunk, didn’t I suffer?
21 | Excuse me, young man, has it ever happened to you... hm... well, to
22 | petition hopelessly for a loan?”
23 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/raskolnikov.txt:
--------------------------------------------------------------------------------
1 | In vespera praecipue calida ineunte Iulio iuvenis e cenaculo in quo hospitabatur in
2 | S. loco exiit et lente, quasi dubitans, versus pontem K. ambulavit. Feliciter vitavit
3 | ne domina sua eum in scala occurreret. Cenaculum suum sub tecto domus altae, quinque
4 | tabulatorum, erat, et magis armario quam conclavi simile erat. Domina, quae ei cenaculum,
5 | prandia et ministerium praebebat, in tabulato infra habitabat, et quotienscumque exibat,
6 | praeterire culinam eius, cuius ianua semper aperta erat, cogebatur. Et quoties praeteribat,
7 | iuvenis aegrotum et pavidum sensum habebat, quod eum corrugare frontem et pudere faciebat.
8 | Desperanter apud dominam suam aere alieno obrutus erat, et eam convenire timebat.
9 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/raskolnikov_2.txt:
--------------------------------------------------------------------------------
1 | When Raskolnikov got home, his hair was soaked with sweat and he was
2 | breathing heavily. He went rapidly up the stairs, walked into his
3 | unlocked room and at once fastened the latch. Then in senseless terror
4 | he rushed to the corner, to that hole under the paper where he had put
5 | the things; put his hand in, and for some minutes felt carefully in the
6 | hole, in every crack and fold of the paper. Finding nothing, he got up
7 | and drew a deep breath.
8 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/sonia.txt:
--------------------------------------------------------------------------------
 1 | On the canal bank near the bridge and not two houses away from the one
 2 | where Sonia lodged, there was a crowd of people, consisting principally
 3 | of gutter children. The hoarse broken voice of Katerina Ivanovna could
 4 | be heard from the bridge, and it certainly was a strange spectacle
 5 | likely to attract a street crowd. Katerina Ivanovna in her old dress
 6 | with the green shawl, wearing a torn straw hat, crushed in a hideous way
 7 | on one side, was really frantic. She was exhausted and breathless. Her
 8 | wasted consumptive face looked more suffering than ever, and indeed out
 9 | of doors in the sunshine a consumptive always looks worse than at home.
10 | But her excitement did not flag, and every moment her irritation grew
11 | more intense. She rushed at the children, shouted at them, coaxed
12 | them, told them before the crowd how to dance and what to sing, began
13 | explaining to them why it was necessary, and driven to desperation by
14 | their not understanding, beat them.... Then she would make a rush at the
15 | crowd; if she noticed any decently dressed person stopping to look, she
16 | immediately appealed to him to see what these children “from a genteel,
17 | one may say aristocratic, house” had been brought to. If she heard
18 | laughter or jeering in the crowd, she would rush at once at the scoffers
19 | and begin squabbling with them. Some people laughed, others shook their
20 | heads, but everyone felt curious at the sight of the madwoman with the
21 | frightened children. The frying-pan of which Lebeziatnikov had spoken
22 | was not there, at least Raskolnikov did not see it. But instead of
23 | rapping on the pan, Katerina Ivanovna began clapping her wasted hands,
24 | when she made Lida and Kolya dance and Polenka sing. She too joined in
25 | the singing, but broke down at the second note with a fearful cough,
26 | which made her curse in despair and even shed tears. What made her most
27 | furious was the weeping and terror of Kolya and Lida. Some effort had
28 | been made to dress the children up as street singers are dressed. The
29 | boy had on a turban made of something red and white to look like a Turk.
30 | There had been no costume for Lida; she simply had a red knitted cap,
31 | or rather a night cap that had belonged to Marmeladov, decorated with
32 | a broken piece of white ostrich feather, which had been Katerina
33 | Ivanovna’s grandmother’s and had been preserved as a family possession.
34 | Polenka was in her everyday dress; she looked in timid perplexity at her
35 | mother, and kept at her side, hiding her tears. She dimly realised her
36 | mother’s condition, and looked uneasily about her. She was terribly
37 | frightened of the street and the crowd. Sonia followed Katerina
38 | Ivanovna, weeping and beseeching her to return home, but Katerina
39 | Ivanovna was not to be persuaded.
40 | 


--------------------------------------------------------------------------------
/js/sdk/examples/data/zametov.txt:
--------------------------------------------------------------------------------
 1 | “How he keeps on! Are you afraid of having let out some secret? Don’t
 2 | worry yourself; you said nothing about a countess. But you said a lot
 3 | about a bulldog, and about ear-rings and chains, and about Krestovsky
 4 | Island, and some porter, and Nikodim Fomitch and Ilya Petrovitch, the
 5 | assistant superintendent. And another thing that was of special interest
 6 | to you was your own sock. You whined, ‘Give me my sock.’ Zametov
 7 | hunted all about your room for your socks, and with his own scented,
 8 | ring-bedecked fingers he gave you the rag. And only then were you
 9 | comforted, and for the next twenty-four hours you held the wretched
10 | thing in your hand; we could not get it from you. It is most likely
11 | somewhere under your quilt at this moment. And then you asked so
12 | piteously for fringe for your trousers. We tried to find out what sort
13 | of fringe, but we could not make it out. Now to business! Here are
14 | thirty-five roubles; I take ten of them, and shall give you an account
15 | of them in an hour or two. I will let Zossimov know at the same time,
16 | though he ought to have been here long ago, for it is nearly twelve. And
17 | you, Nastasya, look in pretty often while I am away, to see whether he
18 | wants a drink or anything else. And I will tell Pashenka what is wanted
19 | myself. Good-bye!”
20 | 


--------------------------------------------------------------------------------
/js/sdk/examples/hello_r2r.js:
--------------------------------------------------------------------------------
 1 | const path = require('path');
 2 | const { r2rClient } = require("r2r-js");
 3 | 
 4 | // Create an account at SciPhi Cloud https://app.sciphi.ai and set an R2R_API_KEY environment variable
 5 | // or set the base URL to your instance. E.g. r2rClient("http://localhost:7272")
 6 | const client = new r2rClient();
 7 | 
 8 | async function main() {
 9 |   const filePath = path.resolve(__dirname, "data/raskolnikov.txt");
10 | 
11 | 
12 |   console.log("Ingesting file...");
13 |   const ingestResult = await client.documents.create({
14 |     file: {
15 |       path: filePath,
16 |       name: "raskolnikov.txt"
17 |     },
18 |     metadata: { author: "Dostoevsky" },
19 |   });
20 |   console.log("Ingest result:", JSON.stringify(ingestResult, null, 2));
21 | 
22 |   console.log("Waiting for the file to be ingested...");
23 |   await new Promise((resolve) => setTimeout(resolve, 10000));
24 | 
25 |   console.log("Performing RAG...");
26 |   const ragResponse = await client.retrieval.rag({
27 |     query: "To whom was Raskolnikov desperately in debt to?",
28 |   });
29 | 
30 |   console.log("Search Results:");
31 |   ragResponse.results.searchResults.chunkSearchResults.forEach(
32 |     (result, index) => {
33 |       console.log(`\nResult ${index + 1}:`);
34 |       console.log(`Text: ${result.text.substring(0, 100)}...`);
35 |       console.log(`Score: ${result.score}`);
36 |     },
37 |   );
38 | 
39 |   console.log("\nCompletion:");
40 |   console.log(ragResponse.results.completion);
41 | }
42 | 
43 | main();
44 | 


--------------------------------------------------------------------------------
/js/sdk/jest.config.js:
--------------------------------------------------------------------------------
 1 | module.exports = {
 2 |   preset: "ts-jest",
 3 |   testEnvironment: "node",
 4 |   testMatch: [
 5 |     "**/__tests__/**/*.ts?(x)",
 6 |     "**/__tests__/**/?(*.)+(spec|test).ts?(x)",
 7 |   ],
 8 |   maxWorkers: 1,
 9 | };
10 | 


--------------------------------------------------------------------------------
/js/sdk/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "r2r-js",
 3 |   "version": "0.4.43",
 4 |   "description": "",
 5 |   "main": "dist/index.js",
 6 |   "browser": "dist/index.browser.js",
 7 |   "types": "dist/index.d.ts",
 8 |   "exports": {
 9 |     ".": "./dist/index.js"
10 |   },
11 |   "scripts": {
12 |     "build": "tsc",
13 |     "prepublishOnly": "npm run build",
14 |     "format": "prettier --write .",
15 |     "pretest:integration": "node setup.js",
16 |     "test": "jest --no-cache",
17 |     "test:watch": "jest --watch",
18 |     "test:coverage": "jest --coverage",
19 |     "test:collections": "jest CollectionsIntegrationSuperUser CollectionsIntegrationUser",
20 |     "test:documents": "jest DocumentsIntegrationSuperUser",
21 |     "test:retrieval": "jest RetrievalIntegrationSuperUser",
22 |     "test:users": "jest UsersIntegrationSuperUser"
23 |   },
24 |   "files": [
25 |     "dist"
26 |   ],
27 |   "keywords": [],
28 |   "author": "",
29 |   "license": "ISC",
30 |   "dependencies": {
31 |     "@jest/globals": "^29.7.0",
32 |     "@rrweb/types": "2.0.0-alpha.17",
33 |     "axios": "^1.8.4",
34 |     "form-data": "^4.0.1",
35 |     "rrweb-snapshot": "2.0.0-alpha.4",
36 |     "uuid": "^10.0.0"
37 |   },
38 |   "devDependencies": {
39 |     "@rrweb/record": "2.0.0-alpha.17",
40 |     "@types/jest": "^29.5.14",
41 |     "@types/node": "^20.17.9",
42 |     "@types/uuid": "^10.0.0",
43 |     "jest": "^29.7.0",
44 |     "prettier": "^3.4.2",
45 |     "ts-jest": "^29.2.5",
46 |     "ts-node": "^10.9.2",
47 |     "typescript": "^5.7.2"
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/js/sdk/src/index.ts:
--------------------------------------------------------------------------------
1 | export { r2rClient } from "./r2rClient";
2 | export * from "./types";
3 | 


--------------------------------------------------------------------------------
/js/sdk/src/utils/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./typeTransformer";
2 | export * from "./utils";
3 | 


--------------------------------------------------------------------------------
/js/sdk/src/utils/utils.ts:
--------------------------------------------------------------------------------
 1 | export function downloadBlob(blob: Blob, filename: string): void {
 2 |   const url = window.URL.createObjectURL(blob);
 3 |   const link = document.createElement("a");
 4 |   link.href = url;
 5 |   link.download = filename;
 6 |   document.body.appendChild(link);
 7 |   link.click();
 8 |   document.body.removeChild(link);
 9 |   window.URL.revokeObjectURL(url);
10 | }
11 | 


--------------------------------------------------------------------------------
/js/sdk/src/v3/clients/system.ts:
--------------------------------------------------------------------------------
 1 | import { r2rClient } from "../../r2rClient";
 2 | import {
 3 |   WrappedGenericMessageResponse,
 4 |   WrappedServerStatsResponse,
 5 |   WrappedSettingsResponse,
 6 | } from "../../types";
 7 | 
 8 | export class SystemClient {
 9 |   constructor(private client: r2rClient) {}
10 | 
11 |   /**
12 |    * Check the health of the R2R server.
13 |    */
14 |   async health(): Promise<WrappedGenericMessageResponse> {
15 |     return await this.client.makeRequest("GET", "health");
16 |   }
17 | 
18 |   /**
19 |    * Get the configuration settings for the R2R server.
20 |    * @returns
21 |    */
22 |   async settings(): Promise<WrappedSettingsResponse> {
23 |     return await this.client.makeRequest("GET", "system/settings");
24 |   }
25 | 
26 |   /**
27 |    * Get statistics about the server, including the start time, uptime,
28 |    * CPU usage, and memory usage.
29 |    * @returns
30 |    */
31 |   async status(): Promise<WrappedServerStatsResponse> {
32 |     return await this.client.makeRequest("GET", "system/status");
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/js/sdk/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es2017",
 4 |     "module": "commonjs",
 5 |     "outDir": "./dist",
 6 |     "rootDir": "./src",
 7 |     "declaration": true,
 8 |     "moduleResolution": "node",
 9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "experimentalDecorators": true,
12 |     "emitDecoratorMetadata": true,
13 |     "forceConsistentCasingInFileNames": true,
14 |     "jsx": "react",
15 |     "lib": ["es2017", "dom"],
16 |     "sourceMap": true,
17 |     "types": ["node", "jest", "@types/jest"],
18 |     "skipLibCheck": true
19 |   },
20 |   "include": ["src/**/*"],
21 |   "exclude": ["node_modules", "**/__tests__/*", "**/*.spec.ts"]
22 | }
23 | 


--------------------------------------------------------------------------------
/py/.dockerignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | *.pyc
 3 | *.pyo
 4 | *.pyd
 5 | .Python
 6 | env
 7 | pip-log.txt
 8 | pip-delete-this-directory.txt
 9 | .tox
10 | .coverage
11 | .coverage.*
12 | .cache
13 | nosetests.xml
14 | coverage.xml
15 | *.cover
16 | *.log
17 | .git
18 | .mypy_cache
19 | .pytest_cache
20 | .hypothesis
21 | 


--------------------------------------------------------------------------------
/py/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.12-slim AS builder
 2 | 
 3 | # Install system dependencies
 4 | RUN apt-get update && apt-get install -y --no-install-recommends \
 5 |     gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
 6 |     poppler-utils \
 7 |     && apt-get clean && rm -rf /var/lib/apt/lists/* \
 8 |     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 9 | 
10 | # Add Rust to PATH
11 | ENV PATH="/root/.cargo/bin:${PATH}"
12 | 
13 | # Create the /app/py directory
14 | RUN mkdir -p /app/py
15 | WORKDIR /app/py
16 | COPY pyproject.toml ./
17 | RUN pip install -e ".[core]" && \
18 |     pip install gunicorn uvicorn pydantic
19 | 
20 | # Optionally, if you want gunicorn and uvicorn explicitly installed, you can
21 | # either list them under [project] in `pyproject.toml` or install them here:
22 | RUN pip install --no-cache-dir gunicorn uvicorn
23 | 
24 | # Create the final image
25 | FROM python:3.12-slim
26 | 
27 | # Minimal runtime deps
28 | RUN apt-get update && apt-get install -y --no-install-recommends \
29 |     curl poppler-utils \
30 |     && apt-get clean && rm -rf /var/lib/apt/lists/*
31 | 
32 | # Copy the built environment from builder to final image
33 | # (If you want a fully self-contained environment, copy /usr/local)
34 | COPY --from=builder /usr/local /usr/local
35 | 
36 | WORKDIR /app
37 | 
38 | # Copy the rest of your source code
39 | COPY . /app
40 | 
41 | # Expose environment variables and port
42 | ARG R2R_PORT=8000 R2R_HOST=0.0.0.0
43 | ENV R2R_PORT=$R2R_PORT R2R_HOST=$R2R_HOST
44 | EXPOSE $R2R_PORT
45 | 
46 | # Launch the app
47 | CMD ["sh", "-c", "uvicorn core.main.app_entry:app --host $R2R_HOST --port $R2R_PORT"]
48 | 


--------------------------------------------------------------------------------
/py/core/agent/__init__.py:
--------------------------------------------------------------------------------
 1 | # FIXME: Once the agent is properly type annotated, remove the type: ignore comments
 2 | from .base import (  # type: ignore
 3 |     R2RAgent,
 4 |     R2RStreamingAgent,
 5 |     R2RXMLStreamingAgent,
 6 | )
 7 | from .rag import (  # type: ignore
 8 |     R2RRAGAgent,
 9 |     R2RStreamingRAGAgent,
10 |     R2RXMLToolsRAGAgent,
11 |     R2RXMLToolsStreamingRAGAgent,
12 | )
13 | 
14 | # Import the concrete implementations
15 | from .research import (
16 |     R2RResearchAgent,
17 |     R2RStreamingResearchAgent,
18 |     R2RXMLToolsResearchAgent,
19 |     R2RXMLToolsStreamingResearchAgent,
20 | )
21 | 
22 | __all__ = [
23 |     # Base
24 |     "R2RAgent",
25 |     "R2RStreamingAgent",
26 |     "R2RXMLStreamingAgent",
27 |     # RAG Agents
28 |     "R2RRAGAgent",
29 |     "R2RXMLToolsRAGAgent",
30 |     "R2RStreamingRAGAgent",
31 |     "R2RXMLToolsStreamingRAGAgent",
32 |     "R2RResearchAgent",
33 |     "R2RStreamingResearchAgent",
34 |     "R2RXMLToolsResearchAgent",
35 |     "R2RXMLToolsStreamingResearchAgent",
36 | ]
37 | 


--------------------------------------------------------------------------------
/py/core/base/agent/__init__.py:
--------------------------------------------------------------------------------
 1 | # FIXME: Once the agent is properly type annotated, remove the type: ignore comments
 2 | from .agent import (  # type: ignore
 3 |     Agent,
 4 |     AgentConfig,
 5 |     Conversation,
 6 | )
 7 | 
 8 | __all__ = [
 9 |     # Agent abstractions
10 |     "Agent",
11 |     "AgentConfig",
12 |     "Conversation",
13 | ]
14 | 


--------------------------------------------------------------------------------
/py/core/base/agent/tools/built_in/search_file_descriptions.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from shared.abstractions.tool import Tool
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | class SearchFileDescriptionsTool(Tool):
 9 |     """
10 |     A tool to search over high-level document data (titles, descriptions, etc.)
11 |     """
12 | 
13 |     def __init__(self):
14 |         super().__init__(
15 |             name="search_file_descriptions",
16 |             description=(
17 |                 "Semantic search over AI-generated summaries of stored documents. "
18 |                 "This does NOT retrieve chunk-level contents or knowledge-graph relationships. "
19 |                 "Use this when you need a broad overview of which documents (files) might be relevant."
20 |             ),
21 |             parameters={
22 |                 "type": "object",
23 |                 "properties": {
24 |                     "query": {
25 |                         "type": "string",
26 |                         "description": "Query string to semantic search over available files 'list documents about XYZ'.",
27 |                     }
28 |                 },
29 |                 "required": ["query"],
30 |             },
31 |             results_function=self.execute,
32 |             llm_format_function=None,
33 |         )
34 | 
35 |     async def execute(self, query: str, *args, **kwargs):
36 |         """
37 |         Calls the file_search_method from context.
38 |         """
39 |         from core.base.abstractions import AggregateSearchResult
40 | 
41 |         context = self.context
42 | 
43 |         # Check if context has necessary method
44 |         if not context or not hasattr(context, "file_search_method"):
45 |             logger.error("No file_search_method provided in context")
46 |             return AggregateSearchResult(document_search_results=[])
47 | 
48 |         # Get the file_search_method from context
49 |         file_search_method = context.file_search_method
50 | 
51 |         # Call the content_method from the context
52 |         try:
53 |             doc_results = await file_search_method(
54 |                 query=query,
55 |                 settings=context.search_settings,
56 |             )
57 |         except Exception as e:
58 |             logger.error(f"Error calling content_method: {e}")
59 |             return AggregateSearchResult(document_search_results=[])
60 | 
61 |         result = AggregateSearchResult(document_search_results=doc_results)
62 | 
63 |         # Add to results collector if context has it
64 |         if hasattr(context, "search_results_collector"):
65 |             context.search_results_collector.add_aggregate_result(result)
66 | 
67 |         return result
68 | 


--------------------------------------------------------------------------------
/py/core/base/agent/tools/built_in/web_search.py:
--------------------------------------------------------------------------------
 1 | from shared.abstractions.tool import Tool
 2 | 
 3 | 
 4 | class WebSearchTool(Tool):
 5 |     """
 6 |     A web search tool that uses Serper to perform Google searches and returns
 7 |     the most relevant results.
 8 |     """
 9 | 
10 |     def __init__(self):
11 |         super().__init__(
12 |             name="web_search",
13 |             description=(
14 |                 "Search for information on the web - use this tool when the user "
15 |                 "query needs LIVE or recent data from the internet."
16 |             ),
17 |             parameters={
18 |                 "type": "object",
19 |                 "properties": {
20 |                     "query": {
21 |                         "type": "string",
22 |                         "description": "The query to search with an external web API.",
23 |                     },
24 |                 },
25 |                 "required": ["query"],
26 |             },
27 |             results_function=self.execute,
28 |             llm_format_function=None,
29 |         )
30 | 
31 |     async def execute(self, query: str, *args, **kwargs):
32 |         """
33 |         Implementation of web search functionality.
34 |         """
35 |         import asyncio
36 | 
37 |         from core.base.abstractions import (
38 |             AggregateSearchResult,
39 |             WebSearchResult,
40 |         )
41 |         from core.utils.serper import SerperClient
42 | 
43 |         context = self.context
44 | 
45 |         serper_client = SerperClient()
46 | 
47 |         raw_results = await asyncio.get_event_loop().run_in_executor(
48 |             None,
49 |             lambda: serper_client.get_raw(query),
50 |         )
51 | 
52 |         web_response = await asyncio.get_event_loop().run_in_executor(
53 |             None, lambda: WebSearchResult.from_serper_results(raw_results)
54 |         )
55 | 
56 |         result = AggregateSearchResult(
57 |             web_search_results=[web_response],
58 |         )
59 | 
60 |         # Add to results collector if context is provided
61 |         if context and hasattr(context, "search_results_collector"):
62 |             context.search_results_collector.add_aggregate_result(result)
63 | 
64 |         return result
65 | 


--------------------------------------------------------------------------------
/py/core/base/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_parser import AsyncParser
2 | 
3 | __all__ = [
4 |     "AsyncParser",
5 | ]
6 | 


--------------------------------------------------------------------------------
/py/core/base/parsers/base_parser.py:
--------------------------------------------------------------------------------
 1 | """Abstract base class for parsers."""
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from typing import AsyncGenerator, Generic, TypeVar
 5 | 
 6 | T = TypeVar("T")
 7 | 
 8 | 
 9 | class AsyncParser(ABC, Generic[T]):
10 |     @abstractmethod
11 |     async def ingest(self, data: T, **kwargs) -> AsyncGenerator[str, None]:
12 |         pass
13 | 


--------------------------------------------------------------------------------
/py/core/base/providers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .auth import AuthConfig, AuthProvider
 2 | from .base import AppConfig, Provider, ProviderConfig
 3 | from .crypto import CryptoConfig, CryptoProvider
 4 | from .database import (
 5 |     DatabaseConfig,
 6 |     DatabaseConnectionManager,
 7 |     DatabaseProvider,
 8 |     Handler,
 9 |     LimitSettings,
10 |     PostgresConfigurationSettings,
11 | )
12 | from .email import EmailConfig, EmailProvider
13 | from .embedding import EmbeddingConfig, EmbeddingProvider
14 | from .file import FileConfig, FileProvider
15 | from .ingestion import (
16 |     ChunkingStrategy,
17 |     IngestionConfig,
18 |     IngestionProvider,
19 | )
20 | from .llm import CompletionConfig, CompletionProvider
21 | from .ocr import OCRConfig, OCRProvider
22 | from .orchestration import OrchestrationConfig, OrchestrationProvider, Workflow
23 | from .scheduler import SchedulerConfig, SchedulerProvider
24 | 
25 | __all__ = [
26 |     # Auth provider
27 |     "AuthConfig",
28 |     "AuthProvider",
29 |     # Base provider classes
30 |     "AppConfig",
31 |     "Provider",
32 |     "ProviderConfig",
33 |     # Crypto provider
34 |     "CryptoConfig",
35 |     "CryptoProvider",
36 |     # Database providers
37 |     "DatabaseConnectionManager",
38 |     "DatabaseConfig",
39 |     "LimitSettings",
40 |     "PostgresConfigurationSettings",
41 |     "DatabaseProvider",
42 |     "Handler",
43 |     # Email provider
44 |     "EmailConfig",
45 |     "EmailProvider",
46 |     # Embedding provider
47 |     "EmbeddingConfig",
48 |     "EmbeddingProvider",
49 |     # File provider
50 |     "FileConfig",
51 |     "FileProvider",
52 |     # Ingestion provider
53 |     "IngestionConfig",
54 |     "IngestionProvider",
55 |     "ChunkingStrategy",
56 |     # LLM provider
57 |     "CompletionConfig",
58 |     "CompletionProvider",
59 |     # OCR provider
60 |     "OCRConfig",
61 |     "OCRProvider",
62 |     # Orchestration provider
63 |     "OrchestrationConfig",
64 |     "OrchestrationProvider",
65 |     "Workflow",
66 |     # Scheduler provider
67 |     "SchedulerConfig",
68 |     "SchedulerProvider",
69 | ]
70 | 


--------------------------------------------------------------------------------
/py/core/base/providers/orchestration.py:
--------------------------------------------------------------------------------
 1 | from abc import abstractmethod
 2 | from enum import Enum
 3 | from typing import Any
 4 | 
 5 | from .base import Provider, ProviderConfig
 6 | 
 7 | 
 8 | class Workflow(Enum):
 9 |     INGESTION = "ingestion"
10 |     GRAPH = "graph"
11 | 
12 | 
13 | class OrchestrationConfig(ProviderConfig):
14 |     provider: str
15 |     max_runs: int = 2_048
16 |     graph_search_results_creation_concurrency_limit: int = 32
17 |     ingestion_concurrency_limit: int = 16
18 |     graph_search_results_concurrency_limit: int = 8
19 | 
20 |     def validate_config(self) -> None:
21 |         if self.provider not in self.supported_providers:
22 |             raise ValueError(f"Provider {self.provider} is not supported.")
23 | 
24 |     @property
25 |     def supported_providers(self) -> list[str]:
26 |         return ["hatchet", "simple"]
27 | 
28 | 
29 | class OrchestrationProvider(Provider):
30 |     def __init__(self, config: OrchestrationConfig):
31 |         super().__init__(config)
32 |         self.config = config
33 |         self.worker = None
34 | 
35 |     @abstractmethod
36 |     async def start_worker(self):
37 |         pass
38 | 
39 |     @abstractmethod
40 |     def get_worker(self, name: str, max_runs: int) -> Any:
41 |         pass
42 | 
43 |     @abstractmethod
44 |     def step(self, *args, **kwargs) -> Any:
45 |         pass
46 | 
47 |     @abstractmethod
48 |     def workflow(self, *args, **kwargs) -> Any:
49 |         pass
50 | 
51 |     @abstractmethod
52 |     def failure(self, *args, **kwargs) -> Any:
53 |         pass
54 | 
55 |     @abstractmethod
56 |     def register_workflows(
57 |         self, workflow: Workflow, service: Any, messages: dict
58 |     ) -> None:
59 |         pass
60 | 
61 |     @abstractmethod
62 |     async def run_workflow(
63 |         self,
64 |         workflow_name: str,
65 |         parameters: dict,
66 |         options: dict,
67 |         *args,
68 |         **kwargs,
69 |     ) -> dict[str, str]:
70 |         pass
71 | 


--------------------------------------------------------------------------------
/py/core/base/providers/scheduler.py:
--------------------------------------------------------------------------------
 1 | from abc import abstractmethod
 2 | 
 3 | from .base import Provider, ProviderConfig
 4 | 
 5 | 
 6 | class SchedulerConfig(ProviderConfig):
 7 |     """Configuration for scheduler provider"""
 8 | 
 9 |     provider: str = "apscheduler"
10 | 
11 |     def validate_config(self):
12 |         if self.provider not in self.supported_providers:
13 |             raise ValueError(
14 |                 f"Scheduler provider {self.provider} is not supported."
15 |             )
16 | 
17 |     @property
18 |     def supported_providers(self) -> list[str]:
19 |         return ["apscheduler"]
20 | 
21 | 
22 | class SchedulerProvider(Provider):
23 |     """Base class for scheduler providers"""
24 | 
25 |     def __init__(self, config: SchedulerConfig):
26 |         super().__init__(config)
27 |         self.config = config
28 | 
29 |     @abstractmethod
30 |     async def add_job(self, func, trigger, **kwargs):
31 |         pass
32 | 
33 |     @abstractmethod
34 |     async def start(self):
35 |         pass
36 | 
37 |     @abstractmethod
38 |     async def shutdown(self):
39 |         pass
40 | 


--------------------------------------------------------------------------------
/py/core/base/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from shared.utils import (
 2 |     RecursiveCharacterTextSplitter,
 3 |     TextSplitter,
 4 |     _decorate_vector_type,
 5 |     _get_vector_column_str,
 6 |     deep_update,
 7 |     dump_collector,
 8 |     dump_obj,
 9 |     format_search_results_for_llm,
10 |     generate_default_prompt_id,
11 |     generate_default_user_collection_id,
12 |     generate_document_id,
13 |     generate_entity_document_id,
14 |     generate_extraction_id,
15 |     generate_id,
16 |     generate_user_id,
17 |     validate_uuid,
18 |     yield_sse_event,
19 | )
20 | 
21 | __all__ = [
22 |     "format_search_results_for_llm",
23 |     "generate_id",
24 |     "generate_default_user_collection_id",
25 |     "generate_document_id",
26 |     "generate_extraction_id",
27 |     "generate_user_id",
28 |     "generate_entity_document_id",
29 |     "generate_default_prompt_id",
30 |     "RecursiveCharacterTextSplitter",
31 |     "TextSplitter",
32 |     "validate_uuid",
33 |     "deep_update",
34 |     "_decorate_vector_type",
35 |     "_get_vector_column_str",
36 |     "yield_sse_event",
37 |     "dump_collector",
38 |     "dump_obj",
39 | ]
40 | 


--------------------------------------------------------------------------------
/py/core/configs/full.toml:
--------------------------------------------------------------------------------
 1 | [completion]
 2 | provider = "r2r"
 3 | concurrent_request_limit = 128
 4 | 
 5 | [ingestion]
 6 | provider = "unstructured_local"
 7 | strategy = "auto"
 8 | chunking_strategy = "by_title"
 9 | new_after_n_chars = 2_048
10 | max_characters = 4_096
11 | combine_under_n_chars = 1_024
12 | overlap = 1_024
13 | 
14 |     [ingestion.extra_parsers]
15 |     pdf = ["zerox", "ocr"]
16 | 
17 | [orchestration]
18 | provider = "hatchet"
19 | kg_creation_concurrency_limit = 32
20 | ingestion_concurrency_limit = 16
21 | kg_concurrency_limit = 8
22 | 


--------------------------------------------------------------------------------
/py/core/configs/full_azure.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "azure/gpt-4.1-mini"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "azure/gpt-4.1"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "azure/gpt-4.1"
10 | 
11 | # LLM used for transcription
12 | audio_lm = "azure/whisper-1"
13 | 
14 | # Reasoning model, used for `research` agent
15 | reasoning_llm = "azure/o3-mini"
16 | # Planning model, used for `research` agent
17 | planning_llm = "azure/o3-mini"
18 | 
19 | [embedding]
20 | base_model = "azure/text-embedding-3-small"
21 | 
22 | [completion_embedding]
23 | base_model = "azure/text-embedding-3-small"
24 | 
25 | [ingestion]
26 | provider = "unstructured_local"
27 | strategy = "auto"
28 | chunking_strategy = "by_title"
29 | new_after_n_chars = 2_048
30 | max_characters = 4_096
31 | combine_under_n_chars = 1_024
32 | overlap = 1_024
33 | document_summary_model = "azure/gpt-4.1-mini"
34 | automatic_extraction = true # enable automatic extraction of entities and relations
35 | 
36 |   [ingestion.extra_parsers]
37 |     pdf = ["zerox", "ocr"]
38 | 
39 |   [ingestion.chunk_enrichment_settings]
40 |     generation_config = { model = "azure/gpt-4.1-mini" }
41 | 
42 | [orchestration]
43 | provider = "hatchet"
44 | kg_creation_concurrency_limit = 32
45 | ingestion_concurrency_limit = 4
46 | kg_concurrency_limit = 8
47 | 


--------------------------------------------------------------------------------
/py/core/configs/full_lm_studio.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "lm_studio/llama-3.2-3b-instruct"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "lm_studio/llama-3.2-3b-instruct"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "lm_studio/llama3.2-vision" # TODO - Replace with viable candidate
10 | 
11 | # LLM used for transcription
12 | audio_lm = "lm_studio/llama-3.2-3b-instruct" # TODO - Replace with viable candidate
13 | 
14 | [embedding]
15 | provider = "litellm"
16 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5"
17 | base_dimension = nan
18 | batch_size = 128
19 | concurrent_request_limit = 2
20 | 
21 | [completion_embedding]
22 | # Generally this should be the same as the embedding config, but advanced users may want to run with a different provider to reduce latency
23 | provider = "litellm"
24 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5"
25 | base_dimension = nan
26 | batch_size = 128
27 | concurrent_request_limit = 2
28 | 
29 | [agent]
30 | tools = ["search_file_knowledge"]
31 | 
32 | [completion]
33 | provider = "litellm"
34 | concurrent_request_limit = 1
35 | 
36 |   [completion.generation_config]
37 |   temperature = 0.1
38 |   top_p = 1
39 |   max_tokens_to_sample = 1_024
40 |   stream = false
41 | 
42 | [ingestion]
43 | provider = "unstructured_local"
44 | strategy = "auto"
45 | chunking_strategy = "by_title"
46 | new_after_n_chars = 512
47 | max_characters = 1_024
48 | combine_under_n_chars = 128
49 | overlap = 20
50 | chunks_for_document_summary = 16
51 | document_summary_model = "lm_studio/llama-3.2-3b-instruct"
52 | automatic_extraction = false
53 | 
54 | [orchestration]
55 | provider = "hatchet"
56 | 


--------------------------------------------------------------------------------
/py/core/configs/full_ollama.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "ollama/llama3.1"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "ollama/llama3.1"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "ollama/llama3.1" # TODO - Replace with viable candidate
10 | 
11 | # LLM used for transcription
12 | audio_lm = "ollama/llama3.1" # TODO - Replace with viable candidate
13 | 
14 | 
15 | # Reasoning model, used for `research` agent
16 | reasoning_llm = "ollama/llama3.1"
17 | # Planning model, used for `research` agent
18 | planning_llm = "ollama/llama3.1"
19 | 
20 | [embedding]
21 | provider = "ollama"
22 | base_model = "mxbai-embed-large"
23 | base_dimension = 1_024
24 | batch_size = 128
25 | concurrent_request_limit = 2
26 | 
27 | [completion_embedding]
28 | provider = "ollama"
29 | base_model = "mxbai-embed-large"
30 | base_dimension = 1_024
31 | batch_size = 128
32 | concurrent_request_limit = 2
33 | 
34 | [agent]
35 | tools = ["search_file_knowledge"]
36 | 
37 | [completion]
38 | provider = "litellm"
39 | concurrent_request_limit = 1
40 | 
41 |   [completion.generation_config]
42 |   temperature = 0.1
43 |   top_p = 1
44 |   max_tokens_to_sample = 1_024
45 |   stream = false
46 |   api_base = "http://host.docker.internal:11434"
47 | 
48 | [ingestion]
49 | provider = "unstructured_local"
50 | strategy = "auto"
51 | chunking_strategy = "by_title"
52 | new_after_n_chars = 512
53 | max_characters = 1_024
54 | combine_under_n_chars = 128
55 | overlap = 20
56 | chunks_for_document_summary = 16
57 | document_summary_model = "ollama/llama3.1"
58 | automatic_extraction = false
59 | 
60 | [orchestration]
61 | provider = "hatchet"
62 | 


--------------------------------------------------------------------------------
/py/core/configs/gemini.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | fast_llm = "gemini/gemini-2.0-flash-lite"
 3 | quality_llm = "gemini/gemini-2.0-flash"
 4 | vlm = "gemini/gemini-2.0-flash"
 5 | audio_lm = "gemini/gemini-2.0-flash-lite"
 6 | 
 7 | [embedding]
 8 | provider = "litellm"
 9 | base_model = "gemini/text-embedding-004"
10 | base_dimension = nan
11 | batch_size = 128
12 | concurrent_request_limit = 2
13 | 
14 | [completion_embedding]
15 | provider = "litellm"
16 | base_model = "gemini/text-embedding-004"
17 | base_dimension = nan
18 | batch_size = 128
19 | concurrent_request_limit = 2
20 | 


--------------------------------------------------------------------------------
/py/core/configs/lm_studio.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "lm_studio/llama-3.2-3b-instruct"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "lm_studio/llama-3.2-3b-instruct"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "lm_studio/llama3.2-vision" # TODO - Replace with viable candidate
10 | 
11 | # LLM used for transcription
12 | audio_lm = "lm_studio/llama-3.2-3b-instruct" # TODO - Replace with viable candidate
13 | 
14 | [embedding]
15 | provider = "litellm"
16 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5"
17 | base_dimension = nan
18 | batch_size = 128
19 | concurrent_request_limit = 2
20 | 
21 | [completion_embedding]
22 | # Generally this should be the same as the embedding config, but advanced users may want to run with a different provider to reduce latency
23 | provider = "litellm"
24 | base_model = "lm_studio/text-embedding-nomic-embed-text-v1.5"
25 | base_dimension = nan
26 | batch_size = 128
27 | concurrent_request_limit = 2
28 | 
29 | [agent]
30 | tools = ["search_file_knowledge"]
31 | 
32 | [completion]
33 | provider = "litellm"
34 | concurrent_request_limit = 1
35 | 
36 |   [completion.generation_config]
37 |   temperature = 0.1
38 |   top_p = 1
39 |   max_tokens_to_sample = 1_024
40 |   stream = false
41 | 


--------------------------------------------------------------------------------
/py/core/configs/ollama.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "ollama/llama3.1" ### NOTE - RECOMMENDED TO USE `openai` with `api_base = "http://localhost:11434/v1"` for best results, otherwise `ollama` with `litellm` is acceptable
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "ollama/llama3.1"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "ollama/llama3.1" # TODO - Replace with viable candidate
10 | 
11 | # LLM used for transcription
12 | audio_lm = "ollama/llama3.1" # TODO - Replace with viable candidate
13 | 
14 | 
15 | # Reasoning model, used for `research` agent
16 | reasoning_llm = "ollama/llama3.1"
17 | # Planning model, used for `research` agent
18 | planning_llm = "ollama/llama3.1"
19 | 
20 | [embedding]
21 | provider = "ollama"
22 | base_model = "mxbai-embed-large"
23 | base_dimension = 1_024
24 | batch_size = 128
25 | concurrent_request_limit = 2
26 | 
27 | [completion_embedding]
28 | provider = "ollama"
29 | base_model = "mxbai-embed-large"
30 | base_dimension = 1_024
31 | batch_size = 128
32 | concurrent_request_limit = 2
33 | 
34 | [agent]
35 | tools = ["search_file_knowledge"]
36 | 
37 | [completion]
38 | provider = "litellm"
39 | concurrent_request_limit = 1
40 | 
41 |   [completion.generation_config]
42 |   temperature = 0.1
43 |   top_p = 1
44 |   max_tokens_to_sample = 1_024
45 |   stream = false
46 |   api_base = "http://localhost:11434/v1"
47 | 


--------------------------------------------------------------------------------
/py/core/configs/r2r_azure.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "azure/gpt-4.1-mini"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "azure/gpt-4.1"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "azure/gpt-4.1"
10 | 
11 | # LLM used for transcription
12 | audio_lm = "azure/whisper-1"
13 | 
14 | # Reasoning model, used for `research` agent
15 | reasoning_llm = "azure/o3-mini"
16 | # Planning model, used for `research` agent
17 | planning_llm = "azure/o3-mini"
18 | 
19 | [embedding]
20 | base_model = "azure/text-embedding-3-small"
21 | 
22 | [completion_embedding]
23 | base_model = "azure/text-embedding-3-small"
24 | 


--------------------------------------------------------------------------------
/py/core/configs/r2r_azure_with_test_limits.toml:
--------------------------------------------------------------------------------
 1 | [app]
 2 | # LLM used for internal operations, like deriving conversation names
 3 | fast_llm = "azure/gpt-4.1-mini"
 4 | 
 5 | # LLM used for user-facing output, like RAG replies
 6 | quality_llm = "azure/gpt-4.1"
 7 | 
 8 | # LLM used for ingesting visual inputs
 9 | vlm = "azure/gpt-4.1"
10 | 
11 | # LLM used for transcription
12 | audio_lm = "azure/whisper-1"
13 | 
14 | 
15 | # Reasoning model, used for `research` agent
16 | reasoning_llm = "azure/o3-mini"
17 | # Planning model, used for `research` agent
18 | planning_llm = "azure/o3-mini"
19 | 
20 | [embedding]
21 | base_model = "openai/text-embedding-3-small"
22 | base_dimension = 512
23 | 
24 | [completion_embedding]
25 | base_model = "openai/text-embedding-3-small"
26 | 
27 | [database]
28 |   [database.limits]
29 |   global_per_min = 10  # Small enough to test quickly
30 |   monthly_limit = 20  # Small enough to test in one run
31 | 
32 |   [database.route_limits]
33 |   "/v3/retrieval/search" = { route_per_min = 5, monthly_limit = 10 }
34 | 
35 |   [database.user_limits."47e53676-b478-5b3f-a409-234ca2164de5"]
36 |   global_per_min = 2
37 |   route_per_min = 1
38 | 


--------------------------------------------------------------------------------
/py/core/configs/r2r_with_auth.toml:
--------------------------------------------------------------------------------
1 | [auth]
2 | provider = "r2r"
3 | access_token_lifetime_in_minutes = 60
4 | refresh_token_lifetime_in_days = 7
5 | require_authentication = true
6 | require_email_verification = false
7 | default_admin_email = "admin@example.com"
8 | default_admin_password = "change_me_immediately"
9 | 


--------------------------------------------------------------------------------
/py/core/configs/tavily.toml:
--------------------------------------------------------------------------------
 1 | [completion]
 2 | provider = "r2r"
 3 | concurrent_request_limit = 128
 4 | 
 5 | [ingestion]
 6 | provider = "unstructured_local"
 7 | strategy = "auto"
 8 | chunking_strategy = "by_title"
 9 | new_after_n_chars = 2_048
10 | max_characters = 4_096
11 | combine_under_n_chars = 1_024
12 | overlap = 1_024
13 |     [ingestion.extra_parsers]
14 |     pdf = "zerox"
15 | 
16 | [orchestration]
17 | provider = "hatchet"
18 | kg_creation_concurrency_limit = 32
19 | ingestion_concurrency_limit = 16
20 | kg_concurrency_limit = 8
21 | 
22 | [agent]
23 | # Enable the Tavily search and extraction tools
24 | rag_tools = [
25 |     "search_file_descriptions",
26 |     "search_file_knowledge",
27 |     "get_file_content",
28 |     "tavily_search",
29 |     "tavily_extract"
30 | ]
31 | 


--------------------------------------------------------------------------------
/py/core/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/__init__.py


--------------------------------------------------------------------------------
/py/core/examples/data/DeepSeek_R1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/DeepSeek_R1.pdf


--------------------------------------------------------------------------------
/py/core/examples/data/aristotle_v2.txt:
--------------------------------------------------------------------------------
 1 | Aristotle[A] (Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, politics, psychology, and the arts. As the founder of the Peripatetic school of philosophy in the Lyceum in Athens, he began the wider Aristotelian tradition that followed, which set the groundwork for the development of modern science.
 2 | 
 3 | Little is known about Aristotle's life. He was born in the city of Stagira in northern Greece during the Classical period. His father, Nicomachus, died when Aristotle was a child, and he was brought up by a guardian. At 17 or 18, he joined Plato's Academy in Athens and remained there until the age of 37 (c. 347 BC). Shortly after Plato died, Aristotle left Athens and, at the request of Philip II of Macedon, tutored his son Alexander the Great beginning in 343 BC. He established a library in the Lyceum, which helped him to produce many of his hundreds of books on papyrus scrolls.
 4 | 
 5 | Though Aristotle wrote many elegant treatises and dialogues for publication, only around a third of his original output has survived, none of it intended for publication. Aristotle provided a complex synthesis of the various philosophies existing prior to him. His teachings and methods of inquiry have had a significant impact across the world, and remain a subject of contemporary philosophical discussion.
 6 | 
 7 | Aristotle's views profoundly shaped medieval scholarship. The influence of his physical science extended from late antiquity and the Early Middle Ages into the Renaissance, and was not replaced systematically until the Enlightenment and theories such as classical mechanics were developed. He influenced Judeo-Islamic philosophies during the Middle Ages, as well as Christian theology, especially the Neoplatonism of the Early Church and the scholastic tradition of the Catholic Church.
 8 | 
 9 | Aristotle was revered among medieval Muslim scholars as "The First Teacher", and among medieval Christians like Thomas Aquinas as simply "The Philosopher", while the poet Dante called him "the master of those who know". His works contain the earliest known formal study of logic, and were studied by medieval scholars such as Peter Abelard and Jean Buridan. Aristotle's influence on logic continued well into the 19th century. In addition, his ethics, although always influential, gained renewed interest with the modern advent of virtue ethics.
10 | 


--------------------------------------------------------------------------------
/py/core/examples/data/graphrag.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/graphrag.pdf


--------------------------------------------------------------------------------
/py/core/examples/data/lyft_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/lyft_2021.pdf


--------------------------------------------------------------------------------
/py/core/examples/data/pg_essay_1.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_1.html


--------------------------------------------------------------------------------
/py/core/examples/data/pg_essay_2.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_2.html


--------------------------------------------------------------------------------
/py/core/examples/data/pg_essay_3.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_3.html


--------------------------------------------------------------------------------
/py/core/examples/data/pg_essay_4.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/pg_essay_4.html


--------------------------------------------------------------------------------
/py/core/examples/data/sample.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/sample.mp3


--------------------------------------------------------------------------------
/py/core/examples/data/sample2.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/sample2.mp3


--------------------------------------------------------------------------------
/py/core/examples/data/screen_shot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/screen_shot.png


--------------------------------------------------------------------------------
/py/core/examples/data/test.txt:
--------------------------------------------------------------------------------
1 | this is a test text
2 | 


--------------------------------------------------------------------------------
/py/core/examples/data/uber_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/data/uber_2021.pdf


--------------------------------------------------------------------------------
/py/core/examples/hello_r2r.py:
--------------------------------------------------------------------------------
 1 | from r2r import R2RClient
 2 | 
 3 | client = R2RClient()
 4 | 
 5 | with open("test.txt", "w") as file:
 6 |     file.write("John is a person that works at Google.")
 7 | 
 8 | client.ingest_files(file_paths=["test.txt"])
 9 | 
10 | # Call RAG directly on an R2R object
11 | rag_response = client.rag(
12 |     query="Who is john",
13 |     rag_generation_config={"model": "gpt-4.1-mini", "temperature": 0.0},
14 | )
15 | results = rag_response["results"]
16 | print(f"Search Results:\n{results['search_results']}")
17 | print(f"Completion:\n{results['completion']}")
18 | 
19 | # RAG Results:
20 | # Search Results:
21 | # AggregateSearchResult(chunk_search_results=[ChunkSearchResult(id=2d71e689-0a0e-5491-a50b-4ecb9494c832, score=0.6848798582029441, metadata={'text': 'John is a person that works at Google.', 'version': 'v0', 'chunk_order': 0, 'document_id': 'ed76b6ee-dd80-5172-9263-919d493b439a', 'id': '1ba494d7-cb2f-5f0e-9f64-76c31da11381', 'associatedQuery': 'Who is john'})], graph_search_results=None)
22 | # Completion:
23 | # ChatCompletion(id='chatcmpl-9g0HnjGjyWDLADe7E2EvLWa35cMkB', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='John is a person that works at Google [1].', role='assistant', function_call=None, tool_calls=None))], created=1719797903, model='gpt-4o-mini', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=11, prompt_tokens=145, total_tokens=156))
24 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/bmp.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/bmp.bmp


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/csv.csv:
--------------------------------------------------------------------------------
 1 | Date,Customer ID,Product,Quantity,Unit Price,Total
 2 | 2024-01-15,C1001,Laptop Pro X,2,999.99,1999.98
 3 | 2024-01-15,C1002,Wireless Mouse,5,29.99,149.95
 4 | 2024-01-16,C1003,External SSD 1TB,3,159.99,479.97
 5 | 2024-01-16,C1001,USB-C Cable,4,19.99,79.96
 6 | 2024-01-17,C1004,Monitor 27",1,349.99,349.99
 7 | 2024-01-17,C1005,Keyboard Elite,2,129.99,259.98
 8 | 2024-01-18,C1002,Headphones Pro,1,199.99,199.99
 9 | 2024-01-18,C1006,Webcam HD,3,79.99,239.97
10 | 2024-01-19,C1007,Power Bank,2,49.99,99.98
11 | 2024-01-19,C1003,Phone Case,5,24.99,124.95
12 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/doc.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/doc.doc


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/docx.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/docx.docx


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/eml.eml:
--------------------------------------------------------------------------------
 1 | From: sender@example.com
 2 | To: recipient@example.com
 3 | Subject: Meeting Summary - Q4 Planning
 4 | Date: Mon, 16 Dec 2024 10:30:00 -0500
 5 | Content-Type: multipart/mixed; boundary="boundary123"
 6 | 
 7 | --boundary123
 8 | Content-Type: text/plain; charset="utf-8"
 9 | Content-Transfer-Encoding: quoted-printable
10 | 
11 | Hi Team,
12 | 
13 | Here's a summary of our Q4 planning meeting:
14 | 
15 | Key Points:
16 | 1. Revenue targets increased by 15%
17 | 2. New product launch scheduled for November
18 | 3. Marketing budget approved for expansion
19 | 
20 | Action Items:
21 | - Sarah: Prepare detailed product roadmap
22 | - Mike: Contact vendors for pricing
23 | - Jennifer: Update financial projections
24 | 
25 | Please review and let me know if you have any questions.
26 | 
27 | Best regards,
28 | Alex
29 | 
30 | --boundary123
31 | Content-Type: text/html; charset="utf-8"
32 | Content-Transfer-Encoding: quoted-printable
33 | 
34 | <html>
35 | <body>
36 | <p>Hi Team,</p>
37 | 
38 | <p>Here's a summary of our Q4 planning meeting:</p>
39 | 
40 | <h3>Key Points:</h3>
41 | <ul>
42 | <li>Revenue targets increased by 15%</li>
43 | <li>New product launch scheduled for November</li>
44 | <li>Marketing budget approved for expansion</li>
45 | </ul>
46 | 
47 | <h3>Action Items:</h3>
48 | <ul>
49 | <li><strong>Sarah:</strong> Prepare detailed product roadmap</li>
50 | <li><strong>Mike:</strong> Contact vendors for pricing</li>
51 | <li><strong>Jennifer:</strong> Update financial projections</li>
52 | </ul>
53 | 
54 | <p>Please review and let me know if you have any questions.</p>
55 | 
56 | <p>Best regards,<br>
57 | Alex</p>
58 | </body>
59 | </html>
60 | 
61 | --boundary123--
62 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/epub.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/epub.epub


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/heic.heic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/heic.heic


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/html.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Product Dashboard</title>
 7 |     <style>
 8 |         body {
 9 |             font-family: Arial, sans-serif;
10 |             margin: 20px;
11 |             background-color: #f5f5f5;
12 |         }
13 |         .dashboard {
14 |             max-width: 800px;
15 |             margin: 0 auto;
16 |             padding: 20px;
17 |             background-color: white;
18 |             border-radius: 8px;
19 |             box-shadow: 0 2px 4px rgba(0,0,0,0.1);
20 |         }
21 |         .header {
22 |             text-align: center;
23 |             margin-bottom: 30px;
24 |         }
25 |         .metrics {
26 |             display: grid;
27 |             grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
28 |             gap: 20px;
29 |             margin-bottom: 30px;
30 |         }
31 |         .metric-card {
32 |             padding: 15px;
33 |             background-color: #f8f9fa;
34 |             border-radius: 4px;
35 |             text-align: center;
36 |         }
37 |     </style>
38 | </head>
39 | <body>
40 |     <div class="dashboard">
41 |         <div class="header">
42 |             <h1>Product Performance Dashboard</h1>
43 |             <p>Real-time metrics and analytics</p>
44 |         </div>
45 |         <div class="metrics">
46 |             <div class="metric-card">
47 |                 <h3>Active Users</h3>
48 |                 <p>1,234</p>
49 |             </div>
50 |             <div class="metric-card">
51 |                 <h3>Revenue</h3>
52 |                 <p>$45,678</p>
53 |             </div>
54 |             <div class="metric-card">
55 |                 <h3>Conversion Rate</h3>
56 |                 <p>2.34%</p>
57 |             </div>
58 |         </div>
59 |         <div class="recent-activity">
60 |             <h2>Recent Activity</h2>
61 |             <ul>
62 |                 <li>New feature deployed: Enhanced search</li>
63 |                 <li>Bug fix: Mobile navigation issue</li>
64 |                 <li>Performance improvement: Cache optimization</li>
65 |             </ul>
66 |         </div>
67 |     </div>
68 | </body>
69 | </html>
70 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/jpeg.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/jpeg.jpeg


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/jpg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/jpg.jpg


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/js.js:
--------------------------------------------------------------------------------
 1 | const path = require('path');
 2 | const { r2rClient } = require("r2r-js");
 3 | 
 4 | // Create an account at SciPhi Cloud https://app.sciphi.ai and set an R2R_API_KEY environment variable
 5 | // or set the base URL to your instance. E.g. r2rClient("http://localhost:7272")
 6 | const client = new r2rClient();
 7 | 
 8 | async function main() {
 9 |   const filePath = path.resolve(__dirname, "data/raskolnikov.txt");
10 | 
11 | 
12 |   console.log("Ingesting file...");
13 |   const ingestResult = await client.documents.create({
14 |     file: {
15 |       path: filePath,
16 |       name: "raskolnikov.txt"
17 |     },
18 |     metadata: { author: "Dostoevsky" },
19 |   });
20 |   console.log("Ingest result:", JSON.stringify(ingestResult, null, 2));
21 | 
22 |   console.log("Waiting for the file to be ingested...");
23 |   await new Promise((resolve) => setTimeout(resolve, 10000));
24 | 
25 |   console.log("Performing RAG...");
26 |   const ragResponse = await client.retrieval.rag({
27 |     query: "To whom was Raskolnikov desperately in debt to?",
28 |   });
29 | 
30 |   console.log("Search Results:");
31 |   ragResponse.results.searchResults.chunkSearchResults.forEach(
32 |     (result, index) => {
33 |       console.log(`\nResult ${index + 1}:`);
34 |       console.log(`Text: ${result.text.substring(0, 100)}...`);
35 |       console.log(`Score: ${result.score}`);
36 |     },
37 |   );
38 | 
39 |   console.log("\nCompletion:");
40 |   console.log(ragResponse.results.completion);
41 | }
42 | 
43 | main();
44 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/json.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "dashboard": {
 3 |         "name": "Product Performance Dashboard",
 4 |         "lastUpdated": "2024-12-16T10:30:00Z",
 5 |         "metrics": {
 6 |             "activeUsers": {
 7 |                 "current": 1234,
 8 |                 "previousPeriod": 1156,
 9 |                 "percentChange": 6.75
10 |             },
11 |             "revenue": {
12 |                 "current": 45678.90,
13 |                 "previousPeriod": 41234.56,
14 |                 "percentChange": 10.78,
15 |                 "currency": "USD"
16 |             },
17 |             "conversionRate": {
18 |                 "current": 2.34,
19 |                 "previousPeriod": 2.12,
20 |                 "percentChange": 10.38,
21 |                 "unit": "percent"
22 |             }
23 |         },
24 |         "recentActivity": [
25 |             {
26 |                 "type": "deployment",
27 |                 "title": "Enhanced search",
28 |                 "description": "New feature deployed: Enhanced search functionality",
29 |                 "timestamp": "2024-12-15T15:45:00Z",
30 |                 "status": "successful"
31 |             },
32 |             {
33 |                 "type": "bugfix",
34 |                 "title": "Mobile navigation",
35 |                 "description": "Bug fix: Mobile navigation issue resolved",
36 |                 "timestamp": "2024-12-14T09:20:00Z",
37 |                 "status": "successful"
38 |             },
39 |             {
40 |                 "type": "performance",
41 |                 "title": "Cache optimization",
42 |                 "description": "Performance improvement: Cache optimization completed",
43 |                 "timestamp": "2024-12-13T11:15:00Z",
44 |                 "status": "successful"
45 |             }
46 |         ],
47 |         "settings": {
48 |             "refreshInterval": 300,
49 |             "timezone": "UTC",
50 |             "theme": "light",
51 |             "notifications": {
52 |                 "email": true,
53 |                 "slack": true,
54 |                 "inApp": true
55 |             }
56 |         }
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/msg.msg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/msg.msg


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/odt.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/odt.odt


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/pdf.pdf


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/png.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/png.png


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/ppt.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/ppt.ppt


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/pptx.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/pptx.pptx


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/py.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class HTMLParser(AsyncParser[str | bytes]):
15 |     """A parser for HTML data."""
16 | 
17 |     def __init__(
18 |         self,
19 |         config: IngestionConfig,
20 |         database_provider: DatabaseProvider,
21 |         llm_provider: CompletionProvider,
22 |     ):
23 |         self.database_provider = database_provider
24 |         self.llm_provider = llm_provider
25 |         self.config = config
26 | 
27 |     async def ingest(
28 |         self, data: str | bytes, *args, **kwargs
29 |     ) -> AsyncGenerator[str, None]:
30 |         """Ingest HTML data and yield text."""
31 |         soup = BeautifulSoup(data, "html.parser")
32 |         yield soup.get_text()
33 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/rst.rst:
--------------------------------------------------------------------------------
 1 | Header 1
 2 | ========
 3 | --------
 4 | Subtitle
 5 | --------
 6 | 
 7 | Example text.
 8 | 
 9 | .. contents:: Table of Contents
10 | 
11 | Header 2
12 | --------
13 | 
14 | 1. Blah blah ``code`` blah
15 | 
16 | 2. More ``code``, hooray
17 | 
18 | 3. Somé UTF-8°
19 | 
20 | The UTF-8 quote character in this table used to cause python to go boom. Now docutils just silently ignores it.
21 | 
22 | .. csv-table:: Things that are Awesome (on a scale of 1-11)
23 | 	:quote: ”
24 | 
25 | 	Thing,Awesomeness
26 | 	Icecream, 7
27 | 	Honey Badgers, 10.5
28 | 	Nickelback, -2
29 | 	Iron Man, 10
30 | 	Iron Man 2, 3
31 | 	Tabular Data, 5
32 | 	Made up ratings, 11
33 | 
34 | .. code::
35 | 
36 | 	A block of code
37 | 
38 | .. code:: python
39 | 
40 | 	python.code('hooray')
41 | 
42 | .. code:: javascript
43 | 
44 | 	export function ƒ(ɑ, β) {}
45 | 
46 | .. doctest:: ignored
47 | 
48 | 	>>> some_function()
49 | 	'result'
50 | 
51 | >>> some_function()
52 | 'result'
53 | 
54 | ==============  ==========================================================
55 | Travis          http://travis-ci.org/tony/pullv
56 | Docs            http://pullv.rtfd.org
57 | API             http://pullv.readthedocs.org/en/latest/api.html
58 | Issues          https://github.com/tony/pullv/issues
59 | Source          https://github.com/tony/pullv
60 | ==============  ==========================================================
61 | 
62 | 
63 | .. image:: https://scan.coverity.com/projects/621/badge.svg
64 | 	:target: https://scan.coverity.com/projects/621
65 | 	:alt: Coverity Scan Build Status
66 | 
67 | .. image:: https://scan.coverity.com/projects/621/badge.svg
68 | 	:alt: Coverity Scan Build Status
69 | 
70 | Field list
71 | ----------
72 | 
73 | :123456789 123456789 123456789 123456789 123456789 1: Uh-oh! This name is too long!
74 | :123456789 123456789 123456789 123456789 1234567890: this is a long name,
75 | 	but no problem!
76 | :123456789 12345: this is not so long, but long enough for the default!
77 | :123456789 1234: this should work even with the default :)
78 | 
79 | someone@somewhere.org
80 | 
81 | Press :kbd:`Ctrl+C` to quit
82 | 
83 | 
84 | .. raw:: html
85 | 
86 |     <p><strong>RAW HTML!</strong></p><style> p {color:blue;} </style>
87 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/rtf.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\deff0
2 | {\fonttbl{\f0\froman\fcharset0 Times New Roman;}}
3 | \viewkind4\uc1\pard\f0\fs24
4 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\par
5 | }
6 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/tiff.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/tiff.tiff


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/tsv.tsv:
--------------------------------------------------------------------------------
 1 | Region	Year	Quarter	Sales	Employees	Growth Rate
 2 | North America	2024	Q1	1250000	45	5.2
 3 | Europe	2024	Q1	980000	38	4.8
 4 | Asia Pacific	2024	Q1	1450000	52	6.1
 5 | South America	2024	Q1	580000	25	3.9
 6 | Africa	2024	Q1	320000	18	4.2
 7 | North America	2024	Q2	1380000	47	5.5
 8 | Europe	2024	Q2	1050000	40	4.9
 9 | Asia Pacific	2024	Q2	1520000	54	5.8
10 | South America	2024	Q2	620000	27	4.1
11 | Africa	2024	Q2	350000	20	4.4
12 | 


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/xls.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/xls.xls


--------------------------------------------------------------------------------
/py/core/examples/supported_file_types/xlsx.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/examples/supported_file_types/xlsx.xlsx


--------------------------------------------------------------------------------
/py/core/main/__init__.py:
--------------------------------------------------------------------------------
 1 | from .abstractions import R2RProviders
 2 | from .api import *
 3 | from .app import *
 4 | 
 5 | # from .app_entry import r2r_app
 6 | from .assembly import *
 7 | from .orchestration import *
 8 | from .services import *
 9 | 
10 | __all__ = [
11 |     # R2R Primary
12 |     "R2RProviders",
13 |     "R2RApp",
14 |     "R2RBuilder",
15 |     "R2RConfig",
16 |     # Factory
17 |     "R2RProviderFactory",
18 |     ## R2R SERVICES
19 |     "AuthService",
20 |     "IngestionService",
21 |     "MaintenanceService",
22 |     "ManagementService",
23 |     "RetrievalService",
24 |     "GraphService",
25 | ]
26 | 


--------------------------------------------------------------------------------
/py/core/main/assembly/__init__.py:
--------------------------------------------------------------------------------
 1 | from ..config import R2RConfig
 2 | from .builder import R2RBuilder
 3 | from .factory import R2RProviderFactory
 4 | 
 5 | __all__ = [
 6 |     # Builder
 7 |     "R2RBuilder",
 8 |     # Config
 9 |     "R2RConfig",
10 |     # Factory
11 |     "R2RProviderFactory",
12 | ]
13 | 


--------------------------------------------------------------------------------
/py/core/main/middleware/__init__.py:
--------------------------------------------------------------------------------
1 | from .project_schema import ProjectSchemaMiddleware
2 | 
3 | __all__ = [
4 |     "ProjectSchemaMiddleware",
5 | ]
6 | 


--------------------------------------------------------------------------------
/py/core/main/middleware/project_schema.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import re
 3 | 
 4 | from fastapi import Request
 5 | from fastapi.responses import JSONResponse
 6 | from starlette.middleware.base import BaseHTTPMiddleware
 7 | 
 8 | from core.utils.context import project_schema_context, set_project_schema
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class ProjectSchemaMiddleware(BaseHTTPMiddleware):
14 |     def __init__(
15 |         self, app, default_schema: str = "r2r_default", schema_exists_func=None
16 |     ):
17 |         super().__init__(app)
18 |         self.default_schema = default_schema
19 |         self.schema_exists_func = schema_exists_func
20 | 
21 |     async def dispatch(self, request: Request, call_next):
22 |         # Skip schema check for static files, docs, etc.
23 |         if request.url.path.startswith(
24 |             ("/docs", "/redoc", "/static", "/openapi.json")
25 |         ):
26 |             return await call_next(request)
27 | 
28 |         # Get the project name from the x-project-name header or use default
29 |         schema_name = request.headers.get(
30 |             "x-project-name", self.default_schema
31 |         )
32 | 
33 |         # Validate schema name format (prevent SQL injection)
34 |         if not re.match(r"^[a-zA-Z0-9_]+$", schema_name):
35 |             return JSONResponse(
36 |                 status_code=400,
37 |                 content={"detail": "Invalid schema name format"},
38 |             )
39 | 
40 |         # Check if schema exists (optional)
41 |         if self.schema_exists_func and schema_name != self.default_schema:
42 |             try:
43 |                 schema_exists = await self.schema_exists_func(schema_name)
44 |                 if not schema_exists:
45 |                     return JSONResponse(
46 |                         status_code=403,
47 |                         content={
48 |                             "detail": f"Schema '{schema_name}' does not exist"
49 |                         },
50 |                     )
51 |             except Exception as e:
52 |                 logger.error(f"Error checking schema existence: {e}")
53 |                 return JSONResponse(
54 |                     status_code=500,
55 |                     content={
56 |                         "detail": "Internal server error checking schema"
57 |                     },
58 |                 )
59 | 
60 |         # Set the project schema in the context for this request
61 |         schema_name = schema_name.replace('"', "")
62 | 
63 |         token = set_project_schema(schema_name)
64 | 
65 |         try:
66 |             # Process the request with the set schema
67 |             return await call_next(request)
68 |         finally:
69 |             # Reset context when done
70 |             project_schema_context.reset(token)
71 | 


--------------------------------------------------------------------------------
/py/core/main/orchestration/__init__.py:
--------------------------------------------------------------------------------
 1 | # FIXME: Once the Hatchet workflows are type annotated, remove the type: ignore comments
 2 | from .hatchet.graph_workflow import (  # type: ignore
 3 |     hatchet_graph_search_results_factory,
 4 | )
 5 | from .hatchet.ingestion_workflow import (  # type: ignore
 6 |     hatchet_ingestion_factory,
 7 | )
 8 | from .simple.graph_workflow import simple_graph_search_results_factory
 9 | from .simple.ingestion_workflow import simple_ingestion_factory
10 | 
11 | __all__ = [
12 |     "hatchet_ingestion_factory",
13 |     "hatchet_graph_search_results_factory",
14 |     "simple_ingestion_factory",
15 |     "simple_graph_search_results_factory",
16 | ]
17 | 


--------------------------------------------------------------------------------
/py/core/main/orchestration/hatchet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/main/orchestration/hatchet/__init__.py


--------------------------------------------------------------------------------
/py/core/main/orchestration/simple/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/main/orchestration/simple/__init__.py


--------------------------------------------------------------------------------
/py/core/main/services/__init__.py:
--------------------------------------------------------------------------------
 1 | from .auth_service import AuthService
 2 | from .graph_service import GraphService
 3 | from .ingestion_service import IngestionService, IngestionServiceAdapter
 4 | from .maintenance_service import MaintenanceService
 5 | from .management_service import ManagementService
 6 | from .retrieval_service import RetrievalService  # type: ignore
 7 | 
 8 | __all__ = [
 9 |     "AuthService",
10 |     "IngestionService",
11 |     "IngestionServiceAdapter",
12 |     "MaintenanceService",
13 |     "ManagementService",
14 |     "GraphService",
15 |     "RetrievalService",
16 | ]
17 | 


--------------------------------------------------------------------------------
/py/core/main/services/base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC
 2 | 
 3 | from ..abstractions import R2RProviders
 4 | from ..config import R2RConfig
 5 | 
 6 | 
 7 | class Service(ABC):
 8 |     def __init__(
 9 |         self,
10 |         config: R2RConfig,
11 |         providers: R2RProviders,
12 |     ):
13 |         self.config = config
14 |         self.providers = providers
15 | 


--------------------------------------------------------------------------------
/py/core/parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .media import *
 2 | from .structured import *
 3 | from .text import *
 4 | 
 5 | __all__ = [
 6 |     "AudioParser",
 7 |     "BMPParser",
 8 |     "DOCParser",
 9 |     "DOCXParser",
10 |     "ImageParser",
11 |     "ODTParser",
12 |     "OCRPDFParser",
13 |     "VLMPDFParser",
14 |     "BasicPDFParser",
15 |     "PDFParserUnstructured",
16 |     "PPTParser",
17 |     "PPTXParser",
18 |     "RTFParser",
19 |     "CSVParser",
20 |     "CSVParserAdvanced",
21 |     "EMLParser",
22 |     "EPUBParser",
23 |     "JSONParser",
24 |     "MSGParser",
25 |     "ORGParser",
26 |     "P7SParser",
27 |     "RSTParser",
28 |     "TSVParser",
29 |     "XLSParser",
30 |     "XLSXParser",
31 |     "XLSXParserAdvanced",
32 |     "MDParser",
33 |     "HTMLParser",
34 |     "TextParser",
35 |     "PythonParser",
36 |     "CSSParser",
37 |     "JSParser",
38 |     "TSParser",
39 | ]
40 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/__init__.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from .audio_parser import AudioParser
 3 | from .bmp_parser import BMPParser
 4 | from .doc_parser import DOCParser
 5 | from .docx_parser import DOCXParser
 6 | from .img_parser import ImageParser
 7 | from .odt_parser import ODTParser
 8 | from .pdf_parser import (
 9 |     BasicPDFParser,
10 |     OCRPDFParser,
11 |     PDFParserUnstructured,
12 |     VLMPDFParser,
13 | )
14 | from .ppt_parser import PPTParser
15 | from .pptx_parser import PPTXParser
16 | from .rtf_parser import RTFParser
17 | 
18 | __all__ = [
19 |     "AudioParser",
20 |     "BMPParser",
21 |     "DOCParser",
22 |     "DOCXParser",
23 |     "ImageParser",
24 |     "ODTParser",
25 |     "OCRPDFParser",
26 |     "VLMPDFParser",
27 |     "BasicPDFParser",
28 |     "PDFParserUnstructured",
29 |     "PPTParser",
30 |     "PPTXParser",
31 |     "RTFParser",
32 | ]
33 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/audio_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | import logging
 3 | import os
 4 | import tempfile
 5 | from typing import AsyncGenerator
 6 | 
 7 | from litellm import atranscription
 8 | 
 9 | from core.base.parsers.base_parser import AsyncParser
10 | from core.base.providers import (
11 |     CompletionProvider,
12 |     DatabaseProvider,
13 |     IngestionConfig,
14 | )
15 | 
16 | logger = logging.getLogger()
17 | 
18 | 
19 | class AudioParser(AsyncParser[bytes]):
20 |     """A parser for audio data using Whisper transcription."""
21 | 
22 |     def __init__(
23 |         self,
24 |         config: IngestionConfig,
25 |         database_provider: DatabaseProvider,
26 |         llm_provider: CompletionProvider,
27 |     ):
28 |         self.database_provider = database_provider
29 |         self.llm_provider = llm_provider
30 |         self.config = config
31 |         self.atranscription = atranscription
32 | 
33 |     async def ingest(  # type: ignore
34 |         self, data: bytes, **kwargs
35 |     ) -> AsyncGenerator[str, None]:
36 |         """Ingest audio data and yield a transcription using Whisper via
37 |         LiteLLM.
38 | 
39 |         Args:
40 |             data: Raw audio bytes
41 |             *args, **kwargs: Additional arguments passed to the transcription call
42 | 
43 |         Yields:
44 |             Chunks of transcribed text
45 |         """
46 |         try:
47 |             # Create a temporary file to store the audio data
48 |             with tempfile.NamedTemporaryFile(
49 |                 suffix=".wav", delete=False
50 |             ) as temp_file:
51 |                 temp_file.write(data)
52 |                 temp_file_path = temp_file.name
53 | 
54 |             # Call Whisper transcription
55 |             response = await self.atranscription(
56 |                 model=self.config.audio_transcription_model
57 |                 or self.config.app.audio_lm,
58 |                 file=open(temp_file_path, "rb"),
59 |                 **kwargs,
60 |             )
61 | 
62 |             # The response should contain the transcribed text directly
63 |             yield response.text
64 | 
65 |         except Exception as e:
66 |             logger.error(f"Error processing audio with Whisper: {str(e)}")
67 |             raise
68 | 
69 |         finally:
70 |             # Clean up the temporary file
71 |             try:
72 |                 os.unlink(temp_file_path)
73 |             except Exception as e:
74 |                 logger.warning(f"Failed to delete temporary file: {str(e)}")
75 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/bmp_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from core.base.parsers.base_parser import AsyncParser
 5 | from core.base.providers import (
 6 |     CompletionProvider,
 7 |     DatabaseProvider,
 8 |     IngestionConfig,
 9 | )
10 | 
11 | 
12 | class BMPParser(AsyncParser[str | bytes]):
13 |     """A parser for BMP image data."""
14 | 
15 |     def __init__(
16 |         self,
17 |         config: IngestionConfig,
18 |         database_provider: DatabaseProvider,
19 |         llm_provider: CompletionProvider,
20 |     ):
21 |         self.database_provider = database_provider
22 |         self.llm_provider = llm_provider
23 |         self.config = config
24 | 
25 |         import struct
26 | 
27 |         self.struct = struct
28 | 
29 |     async def extract_bmp_metadata(self, data: bytes) -> dict:
30 |         """Extract metadata from BMP file header."""
31 |         try:
32 |             # BMP header format
33 |             header_format = "<2sIHHI"
34 |             header_size = self.struct.calcsize(header_format)
35 | 
36 |             # Unpack header data
37 |             (
38 |                 signature,
39 |                 file_size,
40 |                 reserved,
41 |                 reserved2,
42 |                 data_offset,
43 |             ) = self.struct.unpack(header_format, data[:header_size])
44 | 
45 |             # DIB header
46 |             dib_format = "<IiiHHIIiiII"
47 |             dib_size = self.struct.calcsize(dib_format)
48 |             dib_data = self.struct.unpack(dib_format, data[14 : 14 + dib_size])
49 | 
50 |             width = dib_data[1]
51 |             height = abs(dib_data[2])  # Height can be negative
52 |             bits_per_pixel = dib_data[4]
53 |             compression = dib_data[5]
54 | 
55 |             return {
56 |                 "width": width,
57 |                 "height": height,
58 |                 "bits_per_pixel": bits_per_pixel,
59 |                 "file_size": file_size,
60 |                 "compression": compression,
61 |             }
62 |         except Exception as e:
63 |             return {"error": f"Failed to parse BMP header: {str(e)}"}
64 | 
65 |     async def ingest(
66 |         self, data: str | bytes, **kwargs
67 |     ) -> AsyncGenerator[str, None]:
68 |         """Ingest BMP data and yield metadata description."""
69 |         if isinstance(data, str):
70 |             # Convert base64 string to bytes if needed
71 |             import base64
72 | 
73 |             data = base64.b64decode(data)
74 | 
75 |         metadata = await self.extract_bmp_metadata(data)
76 | 
77 |         # Generate description of the BMP file
78 |         yield f"BMP image with dimensions {metadata.get('width', 'unknown')}x{metadata.get('height', 'unknown')} pixels, {metadata.get('bits_per_pixel', 'unknown')} bits per pixel, file size: {metadata.get('file_size', 'unknown')} bytes"
79 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/docx_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from io import BytesIO
 3 | from typing import AsyncGenerator
 4 | 
 5 | from docx import Document
 6 | 
 7 | from core.base.parsers.base_parser import AsyncParser
 8 | from core.base.providers import (
 9 |     CompletionProvider,
10 |     DatabaseProvider,
11 |     IngestionConfig,
12 | )
13 | 
14 | 
15 | class DOCXParser(AsyncParser[str | bytes]):
16 |     """A parser for DOCX data."""
17 | 
18 |     def __init__(
19 |         self,
20 |         config: IngestionConfig,
21 |         database_provider: DatabaseProvider,
22 |         llm_provider: CompletionProvider,
23 |     ):
24 |         self.database_provider = database_provider
25 |         self.llm_provider = llm_provider
26 |         self.config = config
27 |         self.Document = Document
28 | 
29 |     async def ingest(
30 |         self, data: str | bytes, *args, **kwargs
31 |     ) -> AsyncGenerator[str, None]:  # type: ignore
32 |         """Ingest DOCX data and yield text from each paragraph."""
33 |         if isinstance(data, str):
34 |             raise ValueError("DOCX data must be in bytes format.")
35 | 
36 |         doc = self.Document(BytesIO(data))
37 |         for paragraph in doc.paragraphs:
38 |             yield paragraph.text
39 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/odt_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | import xml.etree.ElementTree as ET
 3 | import zipfile
 4 | from typing import AsyncGenerator
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class ODTParser(AsyncParser[str | bytes]):
15 |     def __init__(
16 |         self,
17 |         config: IngestionConfig,
18 |         database_provider: DatabaseProvider,
19 |         llm_provider: CompletionProvider,
20 |     ):
21 |         self.database_provider = database_provider
22 |         self.llm_provider = llm_provider
23 |         self.config = config
24 |         self.zipfile = zipfile
25 |         self.ET = ET
26 | 
27 |     async def ingest(
28 |         self, data: str | bytes, **kwargs
29 |     ) -> AsyncGenerator[str, None]:
30 |         if isinstance(data, str):
31 |             raise ValueError("ODT data must be in bytes format.")
32 | 
33 |         from io import BytesIO
34 | 
35 |         file_obj = BytesIO(data)
36 | 
37 |         try:
38 |             with self.zipfile.ZipFile(file_obj) as odt:
39 |                 # ODT files are zip archives containing content.xml
40 |                 content = odt.read("content.xml")
41 |                 root = self.ET.fromstring(content)
42 | 
43 |                 # ODT XML namespace
44 |                 ns = {"text": "urn:oasis:names:tc:opendocument:xmlns:text:1.0"}
45 | 
46 |                 # Extract paragraphs and headers
47 |                 for p in root.findall(".//text:p", ns):
48 |                     text = "".join(p.itertext())
49 |                     if text.strip():
50 |                         yield text.strip()
51 | 
52 |                 for h in root.findall(".//text:h", ns):
53 |                     text = "".join(h.itertext())
54 |                     if text.strip():
55 |                         yield text.strip()
56 | 
57 |         except Exception as e:
58 |             raise ValueError(f"Error processing ODT file: {str(e)}") from e
59 |         finally:
60 |             file_obj.close()
61 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/pptx_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from io import BytesIO
 3 | from typing import AsyncGenerator
 4 | 
 5 | from pptx import Presentation
 6 | 
 7 | from core.base.parsers.base_parser import AsyncParser
 8 | from core.base.providers import (
 9 |     CompletionProvider,
10 |     DatabaseProvider,
11 |     IngestionConfig,
12 | )
13 | 
14 | 
15 | class PPTXParser(AsyncParser[str | bytes]):
16 |     """A parser for PPT data."""
17 | 
18 |     def __init__(
19 |         self,
20 |         config: IngestionConfig,
21 |         database_provider: DatabaseProvider,
22 |         llm_provider: CompletionProvider,
23 |     ):
24 |         self.database_provider = database_provider
25 |         self.llm_provider = llm_provider
26 |         self.config = config
27 |         self.Presentation = Presentation
28 | 
29 |     async def ingest(
30 |         self, data: str | bytes, **kwargs
31 |     ) -> AsyncGenerator[str, None]:  # type: ignore
32 |         """Ingest PPT data and yield text from each slide."""
33 |         if isinstance(data, str):
34 |             raise ValueError("PPT data must be in bytes format.")
35 | 
36 |         prs = self.Presentation(BytesIO(data))
37 |         for slide in prs.slides:
38 |             for shape in slide.shapes:
39 |                 if hasattr(shape, "text"):
40 |                     yield shape.text
41 | 


--------------------------------------------------------------------------------
/py/core/parsers/media/rtf_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from striprtf.striprtf import rtf_to_text
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class RTFParser(AsyncParser[str | bytes]):
15 |     """Parser for Rich Text Format (.rtf) files."""
16 | 
17 |     def __init__(
18 |         self,
19 |         config: IngestionConfig,
20 |         database_provider: DatabaseProvider,
21 |         llm_provider: CompletionProvider,
22 |     ):
23 |         self.database_provider = database_provider
24 |         self.llm_provider = llm_provider
25 |         self.config = config
26 |         self.striprtf = rtf_to_text
27 | 
28 |     async def ingest(
29 |         self, data: str | bytes, **kwargs
30 |     ) -> AsyncGenerator[str, None]:
31 |         if isinstance(data, bytes):
32 |             data = data.decode("utf-8", errors="ignore")
33 | 
34 |         try:
35 |             # Convert RTF to plain text
36 |             plain_text = self.striprtf(data)
37 | 
38 |             # Split into paragraphs and yield non-empty ones
39 |             paragraphs = plain_text.split("\n\n")
40 |             for paragraph in paragraphs:
41 |                 if paragraph.strip():
42 |                     yield paragraph.strip()
43 | 
44 |         except Exception as e:
45 |             raise ValueError(f"Error processing RTF file: {str(e)}") from e
46 | 


--------------------------------------------------------------------------------
/py/core/parsers/structured/__init__.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from .csv_parser import CSVParser, CSVParserAdvanced
 3 | from .eml_parser import EMLParser
 4 | from .epub_parser import EPUBParser
 5 | from .json_parser import JSONParser
 6 | from .msg_parser import MSGParser
 7 | from .org_parser import ORGParser
 8 | from .p7s_parser import P7SParser
 9 | from .rst_parser import RSTParser
10 | from .tsv_parser import TSVParser
11 | from .xls_parser import XLSParser
12 | from .xlsx_parser import XLSXParser, XLSXParserAdvanced
13 | 
14 | __all__ = [
15 |     "CSVParser",
16 |     "CSVParserAdvanced",
17 |     "EMLParser",
18 |     "EPUBParser",
19 |     "JSONParser",
20 |     "MSGParser",
21 |     "ORGParser",
22 |     "P7SParser",
23 |     "RSTParser",
24 |     "TSVParser",
25 |     "XLSParser",
26 |     "XLSXParser",
27 |     "XLSXParserAdvanced",
28 | ]
29 | 


--------------------------------------------------------------------------------
/py/core/parsers/structured/eml_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from email import message_from_bytes, policy
 3 | from typing import AsyncGenerator
 4 | 
 5 | from core.base.parsers.base_parser import AsyncParser
 6 | from core.base.providers import (
 7 |     CompletionProvider,
 8 |     DatabaseProvider,
 9 |     IngestionConfig,
10 | )
11 | 
12 | 
13 | class EMLParser(AsyncParser[str | bytes]):
14 |     """Parser for EML (email) files."""
15 | 
16 |     def __init__(
17 |         self,
18 |         config: IngestionConfig,
19 |         database_provider: DatabaseProvider,
20 |         llm_provider: CompletionProvider,
21 |     ):
22 |         self.database_provider = database_provider
23 |         self.llm_provider = llm_provider
24 |         self.config = config
25 | 
26 |     async def ingest(
27 |         self, data: str | bytes, **kwargs
28 |     ) -> AsyncGenerator[str, None]:
29 |         """Ingest EML data and yield email content."""
30 |         if isinstance(data, str):
31 |             raise ValueError("EML data must be in bytes format.")
32 | 
33 |         # Parse email with policy for modern email handling
34 |         email_message = message_from_bytes(data, policy=policy.default)
35 | 
36 |         # Extract and yield email metadata
37 |         metadata = []
38 |         if email_message["Subject"]:
39 |             metadata.append(f"Subject: {email_message['Subject']}")
40 |         if email_message["From"]:
41 |             metadata.append(f"From: {email_message['From']}")
42 |         if email_message["To"]:
43 |             metadata.append(f"To: {email_message['To']}")
44 |         if email_message["Date"]:
45 |             metadata.append(f"Date: {email_message['Date']}")
46 | 
47 |         if metadata:
48 |             yield "\n".join(metadata)
49 | 
50 |         # Extract and yield email body
51 |         if email_message.is_multipart():
52 |             for part in email_message.walk():
53 |                 if part.get_content_type() == "text/plain":
54 |                     text = part.get_content()
55 |                     if text.strip():
56 |                         yield text.strip()
57 |                 elif part.get_content_type() == "text/html":
58 |                     # Could add HTML parsing here if needed
59 |                     continue
60 |         else:
61 |             body = email_message.get_content()
62 |             if body.strip():
63 |                 yield body.strip()
64 | 


--------------------------------------------------------------------------------
/py/core/parsers/structured/msg_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | import os
 3 | import tempfile
 4 | from typing import AsyncGenerator
 5 | 
 6 | from msg_parser import MsOxMessage
 7 | 
 8 | from core.base.parsers.base_parser import AsyncParser
 9 | from core.base.providers import (
10 |     CompletionProvider,
11 |     DatabaseProvider,
12 |     IngestionConfig,
13 | )
14 | 
15 | 
16 | class MSGParser(AsyncParser[str | bytes]):
17 |     """Parser for MSG (Outlook Message) files using msg_parser."""
18 | 
19 |     def __init__(
20 |         self,
21 |         config: IngestionConfig,
22 |         database_provider: DatabaseProvider,
23 |         llm_provider: CompletionProvider,
24 |     ):
25 |         self.database_provider = database_provider
26 |         self.llm_provider = llm_provider
27 |         self.config = config
28 | 
29 |     async def ingest(
30 |         self, data: str | bytes, **kwargs
31 |     ) -> AsyncGenerator[str, None]:
32 |         """Ingest MSG data and yield email content."""
33 |         if isinstance(data, str):
34 |             raise ValueError("MSG data must be in bytes format.")
35 | 
36 |         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".msg")
37 |         try:
38 |             tmp_file.write(data)
39 |             tmp_file.close()
40 | 
41 |             msg = MsOxMessage(tmp_file.name)
42 | 
43 |             metadata = []
44 | 
45 |             if msg.subject:
46 |                 metadata.append(f"Subject: {msg.subject}")
47 |             if msg.sender:
48 |                 metadata.append(f"From: {msg.sender}")
49 |             if msg.to:
50 |                 metadata.append(f"To: {', '.join(msg.to)}")
51 |             if msg.sent_date:
52 |                 metadata.append(f"Date: {msg.sent_date}")
53 |             if metadata:
54 |                 yield "\n".join(metadata)
55 |             if msg.body:
56 |                 yield msg.body.strip()
57 | 
58 |             for attachment in msg.attachments:
59 |                 if attachment.Filename:
60 |                     yield f"\nAttachment: {attachment.Filename}"
61 | 
62 |         except Exception as e:
63 |             raise ValueError(f"Error processing MSG file: {str(e)}") from e
64 |         finally:
65 |             os.remove(tmp_file.name)
66 | 


--------------------------------------------------------------------------------
/py/core/parsers/structured/org_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | import orgparse
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class ORGParser(AsyncParser[str | bytes]):
15 |     """Parser for ORG (Emacs Org-mode) files."""
16 | 
17 |     def __init__(
18 |         self,
19 |         config: IngestionConfig,
20 |         database_provider: DatabaseProvider,
21 |         llm_provider: CompletionProvider,
22 |     ):
23 |         self.database_provider = database_provider
24 |         self.llm_provider = llm_provider
25 |         self.config = config
26 |         self.orgparse = orgparse
27 | 
28 |     def _process_node(self, node) -> list[str]:
29 |         """Process an org-mode node and return its content."""
30 |         contents = []
31 | 
32 |         # Add heading with proper level of asterisks
33 |         if node.level > 0:
34 |             contents.append(f"{'*' * node.level} {node.heading}")
35 | 
36 |         # Add body content if exists
37 |         if node.body:
38 |             contents.append(node.body.strip())
39 | 
40 |         return contents
41 | 
42 |     async def ingest(
43 |         self, data: str | bytes, **kwargs
44 |     ) -> AsyncGenerator[str, None]:
45 |         """Ingest ORG data and yield document content."""
46 |         if isinstance(data, bytes):
47 |             data = data.decode("utf-8")
48 | 
49 |         try:
50 |             # Create a temporary file-like object for orgparse
51 |             from io import StringIO
52 | 
53 |             file_obj = StringIO(data)
54 | 
55 |             # Parse the org file
56 |             root = self.orgparse.load(file_obj)
57 | 
58 |             # Process root node if it has content
59 |             if root.body:
60 |                 yield root.body.strip()
61 | 
62 |             # Process all nodes
63 |             for node in root[1:]:  # Skip root node in iteration
64 |                 contents = self._process_node(node)
65 |                 for content in contents:
66 |                     if content.strip():
67 |                         yield content.strip()
68 | 
69 |         except Exception as e:
70 |             raise ValueError(f"Error processing ORG file: {str(e)}") from e
71 |         finally:
72 |             file_obj.close()
73 | 


--------------------------------------------------------------------------------
/py/core/parsers/structured/rst_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from docutils.core import publish_string
 5 | from docutils.writers import html5_polyglot
 6 | 
 7 | from core.base.parsers.base_parser import AsyncParser
 8 | from core.base.providers import (
 9 |     CompletionProvider,
10 |     DatabaseProvider,
11 |     IngestionConfig,
12 | )
13 | 
14 | 
15 | class RSTParser(AsyncParser[str | bytes]):
16 |     """Parser for reStructuredText (.rst) files."""
17 | 
18 |     def __init__(
19 |         self,
20 |         config: IngestionConfig,
21 |         database_provider: DatabaseProvider,
22 |         llm_provider: CompletionProvider,
23 |     ):
24 |         self.database_provider = database_provider
25 |         self.llm_provider = llm_provider
26 |         self.config = config
27 |         self.publish_string = publish_string
28 |         self.html5_polyglot = html5_polyglot
29 | 
30 |     async def ingest(
31 |         self, data: str | bytes, **kwargs
32 |     ) -> AsyncGenerator[str, None]:
33 |         if isinstance(data, bytes):
34 |             data = data.decode("utf-8")
35 | 
36 |         try:
37 |             # Convert RST to HTML
38 |             html = self.publish_string(
39 |                 source=data,
40 |                 writer=self.html5_polyglot.Writer(),
41 |                 settings_overrides={"report_level": 5},
42 |             )
43 | 
44 |             # Basic HTML cleanup
45 |             import re
46 | 
47 |             text = html.decode("utf-8")
48 |             text = re.sub(r"<[^>]+>", " ", text)
49 |             text = re.sub(r"\s+", " ", text)
50 | 
51 |             # Split into paragraphs and yield non-empty ones
52 |             paragraphs = text.split("\n\n")
53 |             for paragraph in paragraphs:
54 |                 if paragraph.strip():
55 |                     yield paragraph.strip()
56 | 
57 |         except Exception as e:
58 |             raise ValueError(f"Error processing RST file: {str(e)}") from e
59 | 


--------------------------------------------------------------------------------
/py/core/parsers/text/__init__.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from .css_parser import CSSParser
 3 | from .html_parser import HTMLParser
 4 | from .js_parser import JSParser
 5 | from .md_parser import MDParser
 6 | from .python_parser import PythonParser
 7 | from .text_parser import TextParser
 8 | from .ts_parser import TSParser
 9 | 
10 | __all__ = [
11 |     "MDParser",
12 |     "HTMLParser",
13 |     "TextParser",
14 |     "PythonParser",
15 |     "CSSParser",
16 |     "JSParser",
17 |     "TSParser",
18 | ]
19 | 


--------------------------------------------------------------------------------
/py/core/parsers/text/html_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class HTMLParser(AsyncParser[str | bytes]):
15 |     """A parser for HTML data."""
16 | 
17 |     def __init__(
18 |         self,
19 |         config: IngestionConfig,
20 |         database_provider: DatabaseProvider,
21 |         llm_provider: CompletionProvider,
22 |     ):
23 |         self.database_provider = database_provider
24 |         self.llm_provider = llm_provider
25 |         self.config = config
26 | 
27 |     async def ingest(
28 |         self, data: str | bytes, *args, **kwargs
29 |     ) -> AsyncGenerator[str, None]:
30 |         """Ingest HTML data and yield text."""
31 |         soup = BeautifulSoup(data, "html.parser")
32 |         yield soup.get_text()
33 | 


--------------------------------------------------------------------------------
/py/core/parsers/text/md_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | 
 6 | from core.base.parsers.base_parser import AsyncParser
 7 | from core.base.providers import (
 8 |     CompletionProvider,
 9 |     DatabaseProvider,
10 |     IngestionConfig,
11 | )
12 | 
13 | 
14 | class MDParser(AsyncParser[str | bytes]):
15 |     """A parser for Markdown data."""
16 | 
17 |     def __init__(
18 |         self,
19 |         config: IngestionConfig,
20 |         database_provider: DatabaseProvider,
21 |         llm_provider: CompletionProvider,
22 |     ):
23 |         self.database_provider = database_provider
24 |         self.llm_provider = llm_provider
25 |         self.config = config
26 | 
27 |         import markdown
28 | 
29 |         self.markdown = markdown
30 | 
31 |     async def ingest(
32 |         self, data: str | bytes, *args, **kwargs
33 |     ) -> AsyncGenerator[str, None]:
34 |         """Ingest Markdown data and yield text."""
35 |         if isinstance(data, bytes):
36 |             data = data.decode("utf-8")
37 |         html = self.markdown.markdown(data)
38 |         soup = BeautifulSoup(html, "html.parser")
39 |         yield soup.get_text()
40 | 


--------------------------------------------------------------------------------
/py/core/parsers/text/text_parser.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from typing import AsyncGenerator
 3 | 
 4 | from core.base.parsers.base_parser import AsyncParser
 5 | from core.base.providers import (
 6 |     CompletionProvider,
 7 |     DatabaseProvider,
 8 |     IngestionConfig,
 9 | )
10 | 
11 | 
12 | class TextParser(AsyncParser[str | bytes]):
13 |     """A parser for raw text data."""
14 | 
15 |     def __init__(
16 |         self,
17 |         config: IngestionConfig,
18 |         database_provider: DatabaseProvider,
19 |         llm_provider: CompletionProvider,
20 |     ):
21 |         self.database_provider = database_provider
22 |         self.llm_provider = llm_provider
23 |         self.config = config
24 | 
25 |     async def ingest(
26 |         self, data: str | bytes, *args, **kwargs
27 |     ) -> AsyncGenerator[str | bytes, None]:
28 |         if isinstance(data, bytes):
29 |             data = data.decode("utf-8")
30 |         yield data
31 | 


--------------------------------------------------------------------------------
/py/core/providers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .auth import (
 2 |     ClerkAuthProvider,
 3 |     JwtAuthProvider,
 4 |     R2RAuthProvider,
 5 |     SupabaseAuthProvider,
 6 | )
 7 | from .crypto import (
 8 |     BcryptCryptoConfig,
 9 |     BCryptCryptoProvider,
10 |     NaClCryptoConfig,
11 |     NaClCryptoProvider,
12 | )
13 | from .database import PostgresDatabaseProvider
14 | from .email import (
15 |     AsyncSMTPEmailProvider,
16 |     ConsoleMockEmailProvider,
17 |     MailerSendEmailProvider,
18 |     SendGridEmailProvider,
19 | )
20 | from .embeddings import (
21 |     LiteLLMEmbeddingProvider,
22 |     OllamaEmbeddingProvider,
23 |     OpenAIEmbeddingProvider,
24 | )
25 | from .file import (
26 |     PostgresFileProvider,
27 |     S3FileProvider,
28 | )
29 | from .ingestion import (  # type: ignore
30 |     R2RIngestionConfig,
31 |     R2RIngestionProvider,
32 |     UnstructuredIngestionConfig,
33 |     UnstructuredIngestionProvider,
34 | )
35 | from .llm import (
36 |     AnthropicCompletionProvider,
37 |     LiteLLMCompletionProvider,
38 |     OpenAICompletionProvider,
39 |     R2RCompletionProvider,
40 | )
41 | from .ocr import (
42 |     MistralOCRProvider,
43 | )
44 | from .orchestration import (
45 |     HatchetOrchestrationProvider,
46 |     SimpleOrchestrationProvider,
47 | )
48 | from .scheduler import (
49 |     APSchedulerProvider,
50 | )
51 | 
52 | __all__ = [
53 |     # Auth
54 |     "R2RAuthProvider",
55 |     "SupabaseAuthProvider",
56 |     "JwtAuthProvider",
57 |     "ClerkAuthProvider",
58 |     # Ingestion
59 |     "R2RIngestionProvider",
60 |     "R2RIngestionConfig",
61 |     "UnstructuredIngestionProvider",
62 |     "UnstructuredIngestionConfig",
63 |     # Crypto
64 |     "BCryptCryptoProvider",
65 |     "BcryptCryptoConfig",
66 |     "NaClCryptoConfig",
67 |     "NaClCryptoProvider",
68 |     # Database
69 |     "PostgresDatabaseProvider",
70 |     # Embeddings
71 |     "LiteLLMEmbeddingProvider",
72 |     "OllamaEmbeddingProvider",
73 |     "OpenAIEmbeddingProvider",
74 |     # Email
75 |     "AsyncSMTPEmailProvider",
76 |     "ConsoleMockEmailProvider",
77 |     "SendGridEmailProvider",
78 |     "MailerSendEmailProvider",
79 |     # File
80 |     "PostgresFileProvider",
81 |     "S3FileProvider",
82 |     # LLM
83 |     "AnthropicCompletionProvider",
84 |     "OpenAICompletionProvider",
85 |     "R2RCompletionProvider",
86 |     "LiteLLMCompletionProvider",
87 |     # OCR
88 |     "MistralOCRProvider",
89 |     # Orchestration
90 |     "HatchetOrchestrationProvider",
91 |     "SimpleOrchestrationProvider",
92 |     # Scheduler
93 |     "APSchedulerProvider",
94 | ]
95 | 


--------------------------------------------------------------------------------
/py/core/providers/auth/__init__.py:
--------------------------------------------------------------------------------
 1 | from .clerk import ClerkAuthProvider
 2 | from .jwt import JwtAuthProvider
 3 | from .r2r_auth import R2RAuthProvider
 4 | from .supabase import SupabaseAuthProvider
 5 | 
 6 | __all__ = [
 7 |     "R2RAuthProvider",
 8 |     "SupabaseAuthProvider",
 9 |     "JwtAuthProvider",
10 |     "ClerkAuthProvider",
11 | ]
12 | 


--------------------------------------------------------------------------------
/py/core/providers/crypto/__init__.py:
--------------------------------------------------------------------------------
 1 | from .bcrypt import BcryptCryptoConfig, BCryptCryptoProvider
 2 | from .nacl import NaClCryptoConfig, NaClCryptoProvider
 3 | 
 4 | __all__ = [
 5 |     "BCryptCryptoProvider",
 6 |     "BcryptCryptoConfig",
 7 |     "NaClCryptoConfig",
 8 |     "NaClCryptoProvider",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/py/core/providers/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .postgres import PostgresDatabaseProvider
2 | 
3 | __all__ = [
4 |     "PostgresDatabaseProvider",
5 | ]
6 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/core/providers/database/prompts/__init__.py


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/chunk_enrichment.yaml:
--------------------------------------------------------------------------------
 1 | chunk_enrichment:
 2 |   template: >
 3 |     ## Task:
 4 | 
 5 |     Enrich and refine the given chunk of text while maintaining its independence and precision.
 6 | 
 7 |     ## Context:
 8 |     Document Summary: {document_summary}
 9 |     Preceding Chunks: {preceding_chunks}
10 |     Succeeding Chunks: {succeeding_chunks}
11 | 
12 |     ## Input Chunk:
13 |     {chunk}
14 | 
15 |     ## Semantic Organization Guidelines:
16 |     1. Group related information:
17 |        - Combine logically connected data points
18 |        - Maintain context within each grouping
19 |        - Preserve relationships between entities
20 | 
21 |     2. Structure hierarchy:
22 |        - Organize from general to specific
23 |        - Use clear categorical divisions
24 |        - Maintain parent-child relationships
25 | 
26 |     3. Information density:
27 |        - Balance completeness with clarity
28 |        - Ensure each chunk can stand alone
29 |        - Preserve essential context
30 | 
31 |     4. Pattern recognition:
32 |        - Standardize similar information
33 |        - Use consistent formatting for similar data types
34 |        - It is appropriate to restructure tables or lists in ways that are more advantageous for sematic matching
35 |        - Maintain searchable patterns
36 | 
37 |     ## Output Requirements:
38 |     1. Each chunk should be independently meaningful
39 |     2. Related information should stay together
40 |     3. Format should support efficient matching
41 |     4. Original data relationships must be preserved
42 |     5. Context should be clear without external references
43 | 
44 |     Maximum length: {chunk_size} characters
45 | 
46 |     Output the restructured chunk only.
47 | 
48 |     ## Restructured Chunk:
49 | 
50 |   input_types:
51 |     document_summary: str
52 |     chunk: str
53 |     preceding_chunks: str
54 |     succeeding_chunks: str
55 |     chunk_size: int
56 |   overwrite_on_diff: true
57 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/collection_summary.yaml:
--------------------------------------------------------------------------------
 1 | collection_summary:
 2 |   template: >
 3 |     ## Task:
 4 | 
 5 |     Generate a comprehensive collection-level summary that describes the overall content, themes, and relationships across multiple documents. The summary should provide a high-level understanding of what the collection contains and represents.
 6 | 
 7 |     ### Input Documents:
 8 | 
 9 |     Document Summaries:
10 |     {document_summaries}
11 | 
12 |     ### Requirements:
13 | 
14 |     1. SCOPE
15 |     - Synthesize key themes and patterns across all documents
16 |     - Identify common topics, entities, and relationships
17 |     - Capture the collection's overall purpose or domain
18 | 
19 |     2. STRUCTURE
20 |     - Target length: Approximately 3-4 concise sentences
21 |     - Focus on collective insights rather than individual document details
22 | 
23 |     3. CONTENT GUIDELINES
24 |     - Emphasize shared concepts and recurring elements
25 |     - Highlight any temporal or thematic progression
26 |     - Identify key stakeholders or entities that appear across documents
27 |     - Note any significant relationships between documents
28 | 
29 |     4. INTEGRATION PRINCIPLES
30 |     - Connect related concepts across different documents
31 |     - Identify overarching narratives or frameworks
32 |     - Preserve important context from individual documents
33 |     - Balance breadth of coverage with depth of insight
34 | 
35 |     ### Query:
36 | 
37 |     Generate a collection-level summary following the above requirements. Focus on synthesizing the key themes and relationships across all documents while maintaining clarity and concision.
38 | 
39 |     ## Response:
40 |   input_types:
41 |     document_summaries: str
42 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/dynamic_rag_agent.yaml:
--------------------------------------------------------------------------------
 1 | dynamic_rag_agent:
 2 |   template: >
 3 |     ### You are a helpful agent that can search for information, the date is {date}.
 4 | 
 5 | 
 6 |     The response should contain line-item attributions to relevant search results, and be as informative if possible. Note that you will only be able to load {max_tool_context_length} tokens of context at a time, if the context surpasses this then it will be truncated. If possible, set filters which will reduce the context returned to only that which is specific, by means of '$eq' or '$overlap' filters.
 7 | 
 8 | 
 9 |     Search rarely exceeds the context window, while getting raw context can depending on the user data shown below. IF YOU CAN FETCH THE RAW CONTEXT, THEN DO SO.
10 | 
11 | 
12 |     The available user documents and collections are shown below:
13 | 
14 |     <= Documents =>
15 |     {document_context}
16 | 
17 | 
18 |     If no relevant results are found, then state that no results were found. If no obvious question is present given the available tools and context, then do not carry out a search, and instead ask for clarification.
19 | 
20 | 
21 |     REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context.
22 | 
23 |   input_types:
24 |     date: str
25 |     document_context: str
26 |     max_tool_context_length: str
27 | 
28 |   overwrite_on_diff: true
29 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/graph_entity_description.yaml:
--------------------------------------------------------------------------------
 1 | graph_entity_description:
 2 |   template: |
 3 |     Given the following information about an entity:
 4 | 
 5 |     Document Summary:
 6 |     {document_summary}
 7 | 
 8 |     Entity Information:
 9 |     {entity_info}
10 | 
11 |     Relationship Data:
12 |     {relationships_txt}
13 | 
14 |     Generate a comprehensive entity description that:
15 | 
16 |     1. Opens with a clear definition statement identifying the entity's primary classification and core function
17 |     2. Incorporates key data points from both the document summary and relationship information
18 |     3. Emphasizes the entity's role within its broader context or system
19 |     4. Highlights critical relationships, particularly those that:
20 |       - Demonstrate hierarchical connections
21 |       - Show functional dependencies
22 |       - Indicate primary use cases or applications
23 | 
24 |     Format Requirements:
25 |     - Length: 2-3 sentences
26 |     - Style: Technical and precise
27 |     - Structure: Definition + Context + Key Relationships
28 |     - Tone: Objective and authoritative
29 | 
30 |     Integration Guidelines:
31 |     - Prioritize information that appears in multiple sources
32 |     - Resolve any conflicting information by favoring the most specific source
33 |     - Include temporal context if relevant to the entity's current state or evolution
34 | 
35 |     Output should reflect the entity's complete nature while maintaining concision and clarity.
36 |   input_types:
37 |     document_summary: str
38 |     entity_info: str
39 |     relationships_txt: str
40 |   overwrite_on_diff: true
41 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/hyde.yaml:
--------------------------------------------------------------------------------
 1 | hyde:
 2 |   template: >
 3 |     ### Instruction:
 4 | 
 5 |     Given the query that follows write a double newline separated list of {num_outputs} single paragraph distinct attempted answers to the given query.
 6 | 
 7 | 
 8 |     DO NOT generate any single answer which is likely to require information from multiple distinct documents,
 9 | 
10 |     EACH single answer will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents.
11 | 
12 | 
13 |     FOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two attempted answers would be
14 | 
15 |     `The key themes of Great Gatsby are ... ANSWER_CONTINUED` and `The key themes of 1984 are ... ANSWER_CONTINUED`, where `ANSWER_CONTINUED` IS TO BE COMPLETED BY YOU in your response.
16 | 
17 | 
18 |     Here is the original user query to be transformed into answers:
19 | 
20 | 
21 |     ### Query:
22 | 
23 |     {message}
24 | 
25 | 
26 |     ### Response:
27 |   input_types:
28 |     num_outputs: int
29 |     message: str
30 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/rag.yaml:
--------------------------------------------------------------------------------
 1 | rag:
 2 |   template: >
 3 |     ## Task:
 4 | 
 5 |     Answer the query given immediately below given the context which follows later. Use line item references to like [c910e2e], [b12cd2f], ... refer to provided search results.
 6 | 
 7 | 
 8 |     ### Query:
 9 | 
10 |     {query}
11 | 
12 | 
13 |     ### Context:
14 | 
15 |     {context}
16 | 
17 | 
18 |     ### Query:
19 | 
20 |     {query}
21 | 
22 | 
23 |     REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context.
24 | 
25 |     ## Response:
26 |   input_types:
27 |     query: str
28 |     context: str
29 |   overwrite_on_diff: true
30 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/rag_fusion.yaml:
--------------------------------------------------------------------------------
 1 | rag_fusion:
 2 |   template: >
 3 |     ### Instruction:
 4 | 
 5 | 
 6 |     Given the following query that follows to write a double newline separated list of up to {num_outputs} queries meant to help answer the original query.
 7 | 
 8 |     DO NOT generate any single query which is likely to require information from multiple distinct documents,
 9 | 
10 |     EACH single query will be used to carry out a cosine similarity semantic search over distinct indexed documents, such as varied medical documents.
11 | 
12 |     FOR EXAMPLE if asked `how do the key themes of Great Gatsby compare with 1984`, the two queries would be
13 | 
14 |     `What are the key themes of Great Gatsby?` and `What are the key themes of 1984?`.
15 | 
16 |     Here is the original user query to be transformed into answers:
17 | 
18 | 
19 |     ### Query:
20 | 
21 |     {message}
22 | 
23 | 
24 |     ### Response:
25 |   input_types:
26 |     num_outputs: int
27 |     message: str
28 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/static_rag_agent.yaml:
--------------------------------------------------------------------------------
 1 | static_rag_agent:
 2 |   template: >
 3 |     ### You are a helpful agent that can search for information, the date is {date}.
 4 | 
 5 |     When asked a question, YOU SHOULD ALWAYS USE YOUR SEARCH TOOL TO ATTEMPT TO SEARCH FOR RELEVANT INFORMATION THAT ANSWERS THE USER QUESTION.
 6 | 
 7 |     The response should contain line-item attributions to relevant search results, and be as informative if possible.
 8 | 
 9 |     If no relevant results are found, then state that no results were found. If no obvious question is present, then do not carry out a search, and instead ask for clarification.
10 | 
11 |     REMINDER - Use line item references to like [c910e2e], [b12cd2f], to refer to the specific search result IDs returned in the provided context.
12 | 
13 |   input_types:
14 |     date: str
15 | 
16 |   overwrite_on_diff: true
17 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/summary.yaml:
--------------------------------------------------------------------------------
 1 | summary:
 2 |   template: >
 3 |     ## Task:
 4 | 
 5 |     Your task is to generate a descriptive summary of the document that follows. Your objective is to return a summary that is roughly 10% of the input document size while retaining as many key points as possible. Your response should begin with `The document contains `.
 6 | 
 7 |     ### Document:
 8 | 
 9 |     {document}
10 | 
11 | 
12 |     ### Query:
13 | 
14 |     Reminder: Your task is to generate a descriptive summary of the document that was given. Your objective is to return a summary that is roughly 10% of the input document size while retaining as many key points as possible. Your response should begin with `The document contains `.
15 | 
16 |     ## Response:
17 |   input_types:
18 |     document: str
19 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/system.yaml:
--------------------------------------------------------------------------------
1 | system:
2 |   template: You are a helpful agent.
3 |   input_types: {}
4 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/vision_img.yaml:
--------------------------------------------------------------------------------
1 | vision_img:
2 |   template: >
3 |     First, provide a title for the image, then explain everything that you see. Be very thorough in your analysis as a user will need to understand the image without seeing it. If it is possible to transcribe the image to text directly, then do so. The more detail you provide, the better the user will understand the image.
4 |   input_types: {}
5 | 


--------------------------------------------------------------------------------
/py/core/providers/database/prompts/vision_pdf.yaml:
--------------------------------------------------------------------------------
 1 | vision_pdf:
 2 |   template: >
 3 |     Convert this PDF page to markdown format, preserving all content and formatting. Follow these guidelines:
 4 | 
 5 |     Text:
 6 |     - Maintain the original text hierarchy (headings, paragraphs, lists)
 7 |     - Preserve any special formatting (bold, italic, underline)
 8 |     - Include all footnotes, citations, and references
 9 |     - Keep text in its original reading order
10 | 
11 |     Tables:
12 |     - Recreate tables using markdown table syntax
13 |     - Preserve all headers, rows, and columns
14 |     - Maintain alignment and formatting where possible
15 |     - Include any table captions or notes
16 | 
17 |     Equations:
18 |     - Convert mathematical equations using LaTeX notation
19 |     - Preserve equation numbers if present
20 |     - Include any surrounding context or references
21 | 
22 |     Images:
23 |     - Enclose image descriptions within [FIG] and [/FIG] tags
24 |     - Include detailed descriptions of:
25 |       * Main subject matter
26 |       * Text overlays or captions
27 |       * Charts, graphs, or diagrams
28 |       * Relevant colors, patterns, or visual elements
29 |     - Maintain image placement relative to surrounding text
30 | 
31 |     Additional Elements:
32 |     - Include page numbers if visible
33 |     - Preserve headers and footers
34 |     - Maintain sidebars or callout boxes
35 |     - Keep any special symbols or characters
36 | 
37 |     Quality Requirements:
38 |     - Ensure 100% content preservation
39 |     - Maintain logical document flow
40 |     - Verify all markdown syntax is valid
41 |     - Double-check completeness before submitting
42 |   input_types: {}
43 | 


--------------------------------------------------------------------------------
/py/core/providers/database/tokens.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime, timedelta
 2 | from typing import Optional
 3 | 
 4 | from core.base import Handler
 5 | 
 6 | from .base import PostgresConnectionManager
 7 | 
 8 | 
 9 | class PostgresTokensHandler(Handler):
10 |     TABLE_NAME = "blacklisted_tokens"
11 | 
12 |     def __init__(
13 |         self, project_name: str, connection_manager: PostgresConnectionManager
14 |     ):
15 |         super().__init__(project_name, connection_manager)
16 | 
17 |     async def create_tables(self):
18 |         query = f"""
19 |         CREATE TABLE IF NOT EXISTS {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (
20 |             id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
21 |             token TEXT NOT NULL,
22 |             blacklisted_at TIMESTAMPTZ DEFAULT NOW()
23 |         );
24 |         CREATE INDEX IF NOT EXISTS idx_{self.project_name}_{PostgresTokensHandler.TABLE_NAME}_token
25 |         ON {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (token);
26 |         CREATE INDEX IF NOT EXISTS idx_{self.project_name}_{PostgresTokensHandler.TABLE_NAME}_blacklisted_at
27 |         ON {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (blacklisted_at);
28 |         """
29 |         await self.connection_manager.execute_query(query)
30 | 
31 |     async def blacklist_token(
32 |         self, token: str, current_time: Optional[datetime] = None
33 |     ):
34 |         if current_time is None:
35 |             current_time = datetime.utcnow()
36 | 
37 |         query = f"""
38 |         INSERT INTO {self._get_table_name(PostgresTokensHandler.TABLE_NAME)} (token, blacklisted_at)
39 |         VALUES ($1, $2)
40 |         """
41 |         await self.connection_manager.execute_query(
42 |             query, [token, current_time]
43 |         )
44 | 
45 |     async def is_token_blacklisted(self, token: str) -> bool:
46 |         query = f"""
47 |         SELECT 1 FROM {self._get_table_name(PostgresTokensHandler.TABLE_NAME)}
48 |         WHERE token = $1
49 |         LIMIT 1
50 |         """
51 |         result = await self.connection_manager.fetchrow_query(query, [token])
52 |         return bool(result)
53 | 
54 |     async def clean_expired_blacklisted_tokens(
55 |         self,
56 |         max_age_hours: int = 7 * 24,
57 |         current_time: Optional[datetime] = None,
58 |     ):
59 |         if current_time is None:
60 |             current_time = datetime.utcnow()
61 |         expiry_time = current_time - timedelta(hours=max_age_hours)
62 | 
63 |         query = f"""
64 |         DELETE FROM {self._get_table_name(PostgresTokensHandler.TABLE_NAME)}
65 |         WHERE blacklisted_at < $1
66 |         """
67 |         await self.connection_manager.execute_query(query, [expiry_time])
68 | 


--------------------------------------------------------------------------------
/py/core/providers/database/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Database utility functions for PostgreSQL operations.
 3 | """
 4 | 
 5 | 
 6 | def psql_quote_literal(value: str) -> str:
 7 |     """Safely quote a string literal for PostgreSQL to prevent SQL injection.
 8 | 
 9 |     This is a simple implementation - in production, you should use proper parameterization
10 |     or your database driver's quoting functions.
11 |     """
12 |     return "'" + value.replace("'", "''") + "'"
13 | 


--------------------------------------------------------------------------------
/py/core/providers/email/__init__.py:
--------------------------------------------------------------------------------
 1 | from .console_mock import ConsoleMockEmailProvider
 2 | from .mailersend import MailerSendEmailProvider
 3 | from .sendgrid import SendGridEmailProvider
 4 | from .smtp import AsyncSMTPEmailProvider
 5 | 
 6 | __all__ = [
 7 |     "ConsoleMockEmailProvider",
 8 |     "AsyncSMTPEmailProvider",
 9 |     "SendGridEmailProvider",
10 |     "MailerSendEmailProvider",
11 | ]
12 | 


--------------------------------------------------------------------------------
/py/core/providers/email/console_mock.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from typing import Optional
 3 | 
 4 | from core.base import EmailProvider
 5 | 
 6 | logger = logging.getLogger()
 7 | 
 8 | 
 9 | class ConsoleMockEmailProvider(EmailProvider):
10 |     """A simple email provider that logs emails to console, useful for
11 |     testing."""
12 | 
13 |     async def send_email(
14 |         self,
15 |         to_email: str,
16 |         subject: str,
17 |         body: str,
18 |         html_body: Optional[str] = None,
19 |         *args,
20 |         **kwargs,
21 |     ) -> None:
22 |         logger.info(f"""
23 |         -------- Email Message --------
24 |         To: {to_email}
25 |         Subject: {subject}
26 |         Body:
27 |         {body}
28 |         -----------------------------
29 |         """)
30 | 
31 |     async def send_verification_email(
32 |         self, to_email: str, verification_code: str, *args, **kwargs
33 |     ) -> None:
34 |         logger.info(f"""
35 |         -------- Email Message --------
36 |         To: {to_email}
37 |         Subject: Please verify your email address
38 |         Body:
39 |         Verification code: {verification_code}
40 |         -----------------------------
41 |         """)
42 | 
43 |     async def send_password_reset_email(
44 |         self, to_email: str, reset_token: str, *args, **kwargs
45 |     ) -> None:
46 |         logger.info(f"""
47 |         -------- Email Message --------
48 |         To: {to_email}
49 |         Subject: Password Reset Request
50 |         Body:
51 |         Reset token: {reset_token}
52 |         -----------------------------
53 |         """)
54 | 
55 |     async def send_password_changed_email(
56 |         self, to_email: str, *args, **kwargs
57 |     ) -> None:
58 |         logger.info(f"""
59 |             -------- Email Message --------
60 |             To: {to_email}
61 |             Subject: Your Password Has Been Changed
62 |             Body:
63 |             Your password has been successfully changed.
64 | 
65 |             For security reasons, you will need to log in again on all your devices.
66 |             -----------------------------
67 |             """)
68 | 


--------------------------------------------------------------------------------
/py/core/providers/embeddings/__init__.py:
--------------------------------------------------------------------------------
 1 | from .litellm import LiteLLMEmbeddingProvider
 2 | from .ollama import OllamaEmbeddingProvider
 3 | from .openai import OpenAIEmbeddingProvider
 4 | 
 5 | __all__ = [
 6 |     "LiteLLMEmbeddingProvider",
 7 |     "OpenAIEmbeddingProvider",
 8 |     "OllamaEmbeddingProvider",
 9 | ]
10 | 


--------------------------------------------------------------------------------
/py/core/providers/embeddings/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from litellm import get_model_info, token_counter
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def truncate_texts_to_token_limit(texts: list[str], model: str) -> list[str]:
 9 |     """
10 |     Truncate texts to fit within the model's token limit.
11 |     """
12 |     try:
13 |         model_info = get_model_info(model=model)
14 |         if not model_info.get("max_input_tokens"):
15 |             return texts  # No truncation needed if no limit specified
16 | 
17 |         truncated_texts = []
18 |         for text in texts:
19 |             text_tokens = token_counter(model=model, text=text)
20 |             assert model_info["max_input_tokens"]
21 |             if text_tokens > model_info["max_input_tokens"]:
22 |                 estimated_chars = (
23 |                     model_info["max_input_tokens"] * 3
24 |                 )  # Estimate 3 chars per token
25 |                 truncated_text = text[:estimated_chars]
26 |                 truncated_texts.append(truncated_text)
27 |                 logger.warning(
28 |                     f"Truncated text from {text_tokens} to ~{model_info['max_input_tokens']} tokens"
29 |                 )
30 |             else:
31 |                 truncated_texts.append(text)
32 | 
33 |         return truncated_texts
34 |     except Exception as e:
35 |         logger.warning(f"Failed to truncate texts: {str(e)}")
36 |         return texts  # Return original texts if truncation fails
37 | 


--------------------------------------------------------------------------------
/py/core/providers/file/__init__.py:
--------------------------------------------------------------------------------
1 | from .postgres import PostgresFileProvider
2 | from .s3 import S3FileProvider
3 | 
4 | __all__ = [
5 |     "PostgresFileProvider",
6 |     "S3FileProvider",
7 | ]
8 | 


--------------------------------------------------------------------------------
/py/core/providers/ingestion/__init__.py:
--------------------------------------------------------------------------------
 1 | # type: ignore
 2 | from .r2r.base import R2RIngestionConfig, R2RIngestionProvider
 3 | from .unstructured.base import (
 4 |     UnstructuredIngestionConfig,
 5 |     UnstructuredIngestionProvider,
 6 | )
 7 | 
 8 | __all__ = [
 9 |     "R2RIngestionConfig",
10 |     "R2RIngestionProvider",
11 |     "UnstructuredIngestionProvider",
12 |     "UnstructuredIngestionConfig",
13 | ]
14 | 


--------------------------------------------------------------------------------
/py/core/providers/llm/__init__.py:
--------------------------------------------------------------------------------
 1 | from .anthropic import AnthropicCompletionProvider
 2 | from .litellm import LiteLLMCompletionProvider
 3 | from .openai import OpenAICompletionProvider
 4 | from .r2r_llm import R2RCompletionProvider
 5 | 
 6 | __all__ = [
 7 |     "AnthropicCompletionProvider",
 8 |     "LiteLLMCompletionProvider",
 9 |     "OpenAICompletionProvider",
10 |     "R2RCompletionProvider",
11 | ]
12 | 


--------------------------------------------------------------------------------
/py/core/providers/ocr/__init__.py:
--------------------------------------------------------------------------------
1 | from .mistral import MistralOCRProvider
2 | 
3 | __all__ = [
4 |     "MistralOCRProvider",
5 | ]
6 | 


--------------------------------------------------------------------------------
/py/core/providers/orchestration/__init__.py:
--------------------------------------------------------------------------------
1 | from .hatchet import HatchetOrchestrationProvider
2 | from .simple import SimpleOrchestrationProvider
3 | 
4 | __all__ = ["HatchetOrchestrationProvider", "SimpleOrchestrationProvider"]
5 | 


--------------------------------------------------------------------------------
/py/core/providers/orchestration/simple.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | from core.base import OrchestrationConfig, OrchestrationProvider, Workflow
 4 | 
 5 | 
 6 | class SimpleOrchestrationProvider(OrchestrationProvider):
 7 |     def __init__(self, config: OrchestrationConfig):
 8 |         super().__init__(config)
 9 |         self.config = config
10 |         self.messages: dict[str, str] = {}
11 | 
12 |     async def start_worker(self):
13 |         pass
14 | 
15 |     def get_worker(self, name: str, max_runs: int) -> Any:
16 |         pass
17 | 
18 |     def step(self, *args, **kwargs) -> Any:
19 |         pass
20 | 
21 |     def workflow(self, *args, **kwargs) -> Any:
22 |         pass
23 | 
24 |     def failure(self, *args, **kwargs) -> Any:
25 |         pass
26 | 
27 |     def register_workflows(
28 |         self, workflow: Workflow, service: Any, messages: dict
29 |     ) -> None:
30 |         for key, msg in messages.items():
31 |             self.messages[key] = msg
32 | 
33 |         if workflow == Workflow.INGESTION:
34 |             from core.main.orchestration import simple_ingestion_factory
35 | 
36 |             self.ingestion_workflows = simple_ingestion_factory(service)
37 | 
38 |         elif workflow == Workflow.GRAPH:
39 |             from core.main.orchestration.simple.graph_workflow import (
40 |                 simple_graph_search_results_factory,
41 |             )
42 | 
43 |             self.graph_search_results_workflows = (
44 |                 simple_graph_search_results_factory(service)
45 |             )
46 | 
47 |     async def run_workflow(
48 |         self, workflow_name: str, parameters: dict, options: dict
49 |     ) -> dict[str, str]:
50 |         if workflow_name in self.ingestion_workflows:
51 |             await self.ingestion_workflows[workflow_name](
52 |                 parameters.get("request")
53 |             )
54 |             return {"message": self.messages[workflow_name]}
55 |         elif workflow_name in self.graph_search_results_workflows:
56 |             await self.graph_search_results_workflows[workflow_name](
57 |                 parameters.get("request")
58 |             )
59 |             return {"message": self.messages[workflow_name]}
60 |         else:
61 |             raise ValueError(f"Workflow '{workflow_name}' not found.")
62 | 


--------------------------------------------------------------------------------
/py/core/providers/scheduler/__init__.py:
--------------------------------------------------------------------------------
1 | from .apscheduler import APSchedulerProvider
2 | 
3 | __all__ = ["APSchedulerProvider"]
4 | 


--------------------------------------------------------------------------------
/py/core/providers/scheduler/apscheduler.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from apscheduler.schedulers.asyncio import AsyncIOScheduler
 4 | 
 5 | from core.base import SchedulerConfig, SchedulerProvider
 6 | 
 7 | logger = logging.getLogger(__name__)
 8 | 
 9 | 
10 | class APSchedulerProvider(SchedulerProvider):
11 |     """Implementation using APScheduler"""
12 | 
13 |     def __init__(self, config: SchedulerConfig):
14 |         super().__init__(config)
15 |         self.scheduler = AsyncIOScheduler()
16 | 
17 |     async def add_job(self, func, trigger, **kwargs):
18 |         logger.info(
19 |             f"Adding job {func.__name__} with trigger {trigger} and kwargs {kwargs}"
20 |         )
21 |         self.scheduler.add_job(func, trigger, **kwargs)
22 | 
23 |     async def start(self):
24 |         self.scheduler.start()
25 |         logger.info("Scheduler started")
26 | 
27 |     async def shutdown(self):
28 |         if self.scheduler.running:
29 |             self.scheduler.shutdown()
30 |             logger.info("Scheduler shutdown")
31 | 
32 |     async def __aenter__(self):
33 |         await self.start()
34 |         return self
35 | 
36 |     async def __aexit__(self, exc_type, exc, tb):
37 |         await self.shutdown()
38 | 


--------------------------------------------------------------------------------
/py/core/utils/context.py:
--------------------------------------------------------------------------------
 1 | from contextvars import ContextVar, Token
 2 | 
 3 | project_schema_context: ContextVar[str | None] = ContextVar(
 4 |     "project_schema_context", default=None
 5 | )
 6 | 
 7 | 
 8 | def get_current_project_schema() -> str | None:
 9 |     """Get the current project schema name from context."""
10 |     return project_schema_context.get()
11 | 
12 | 
13 | def set_project_schema(schema_name: str) -> Token:
14 |     """Set the current project schema in context."""
15 |     return project_schema_context.set(schema_name)
16 | 


--------------------------------------------------------------------------------
/py/core/utils/sentry.py:
--------------------------------------------------------------------------------
 1 | import contextlib
 2 | import os
 3 | 
 4 | import sentry_sdk
 5 | 
 6 | 
 7 | def init_sentry():
 8 |     dsn = os.getenv("R2R_SENTRY_DSN")
 9 |     if not dsn:
10 |         return
11 | 
12 |     with contextlib.suppress(Exception):
13 |         sentry_sdk.init(
14 |             dsn=dsn,
15 |             environment=os.getenv("R2R_SENTRY_ENVIRONMENT", "not_set"),
16 |             traces_sample_rate=float(
17 |                 os.getenv("R2R_SENTRY_TRACES_SAMPLE_RATE", 1.0)
18 |             ),
19 |             profiles_sample_rate=float(
20 |                 os.getenv("R2R_SENTRY_PROFILES_SAMPLE_RATE", 1.0)
21 |             ),
22 |         )
23 | 


--------------------------------------------------------------------------------
/py/migrations/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
2 | 


--------------------------------------------------------------------------------
/py/migrations/alembic.ini:
--------------------------------------------------------------------------------
 1 | [alembic]
 2 | script_location = .
 3 | sqlalchemy.url = postgresql://postgres:postgres@localhost:5432/postgres
 4 | 
 5 | [loggers]
 6 | keys = root,sqlalchemy,alembic
 7 | 
 8 | [handlers]
 9 | keys = console
10 | 
11 | [formatters]
12 | keys = generic
13 | 
14 | [logger_root]
15 | level = WARN
16 | handlers = console
17 | qualname =
18 | 
19 | [logger_sqlalchemy]
20 | level = WARN
21 | handlers =
22 | qualname = sqlalchemy.engine
23 | 
24 | [logger_alembic]
25 | level = INFO
26 | handlers =
27 | qualname = alembic
28 | 
29 | [handler_console]
30 | class = StreamHandler
31 | args = (sys.stderr,)
32 | level = NOTSET
33 | formatter = generic
34 | 
35 | [formatter_generic]
36 | format = %(levelname)-5.5s [%(name)s] %(message)s
37 | datefmt = %H:%M:%S
38 | 


--------------------------------------------------------------------------------
/py/migrations/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | Schema: %(schema)s
 7 | """
 8 | from typing import Sequence, Union
 9 | 
10 | from alembic import op
11 | import sqlalchemy as sa
12 | ${imports if imports else ""}
13 | 
14 | # revision identifiers, used by Alembic.
15 | revision: str = ${repr(up_revision)}
16 | down_revision: Union[str, None] = ${repr(down_revision)}
17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19 | 
20 | def upgrade() -> None:
21 |     # Get the schema name
22 |     schema = op.get_context().get_context_kwargs.get('version_table_schema')
23 | 
24 |     """
25 |     ### Schema-aware migration
26 |     All table operations should include the schema name, for example:
27 | 
28 |     op.create_tables(
29 |         'my_table',
30 |         sa.Column('id', sa.Integer(), nullable=False),
31 |         sa.Column('name', sa.String(), nullable=True),
32 |         schema=schema
33 |     )
34 | 
35 |     op.create_index(
36 |         'idx_my_table_name',
37 |         'my_table',
38 |         ['name'],
39 |         schema=schema
40 |     )
41 |     """
42 |     ${upgrades if upgrades else "pass"}
43 | 
44 | def downgrade() -> None:
45 |     # Get the schema name
46 |     schema = op.get_context().get_context_kwargs.get('version_table_schema')
47 | 
48 |     """
49 |     ### Schema-aware downgrade
50 |     Remember to include schema in all operations, for example:
51 | 
52 |     op.drop_table('my_table', schema=schema)
53 |     """
54 |     ${downgrades if downgrades else "pass"}
55 | 


--------------------------------------------------------------------------------
/py/migrations/versions/7eb70560f406_add_limits_overrides_to_users.py:
--------------------------------------------------------------------------------
 1 | """add_limits_overrides_to_users.
 2 | 
 3 | Revision ID: 7eb70560f406
 4 | Revises: c45a9cf6a8a4
 5 | Create Date: 2025-01-03 20:27:16.139511
 6 | """
 7 | 
 8 | import os
 9 | from typing import Sequence, Union
10 | 
11 | import sqlalchemy as sa
12 | from alembic import op
13 | from sqlalchemy import inspect
14 | 
15 | # revision identifiers, used by Alembic.
16 | revision: str = "7eb70560f406"
17 | down_revision: Union[str, None] = "c45a9cf6a8a4"
18 | branch_labels: Union[str, Sequence[str], None] = None
19 | depends_on: Union[str, Sequence[str], None] = None
20 | 
21 | project_name = os.getenv("R2R_PROJECT_NAME", "r2r_default")
22 | 
23 | 
24 | def check_if_upgrade_needed():
25 |     """Check if the upgrade has already been applied."""
26 |     connection = op.get_bind()
27 |     inspector = inspect(connection)
28 | 
29 |     # Check if users table exists
30 |     if not inspector.has_table("users", schema=project_name):
31 |         print(
32 |             f"Migration not needed: '{project_name}.users' table doesn't exist"
33 |         )
34 |         return False
35 | 
36 |     users_columns = {
37 |         col["name"]
38 |         for col in inspector.get_columns("users", schema=project_name)
39 |     }
40 | 
41 |     if "limits_overrides" in users_columns:
42 |         print(
43 |             "Migration not needed: users table already has limits_overrides column"
44 |         )
45 |         return False
46 |     else:
47 |         print("Migration needed: users table needs limits_overrides column")
48 |         return True
49 | 
50 | 
51 | def upgrade() -> None:
52 |     if not check_if_upgrade_needed():
53 |         return
54 | 
55 |     # Add the limits_overrides column as JSONB with default NULL
56 |     op.add_column(
57 |         "users",
58 |         sa.Column("limits_overrides", sa.JSON(), nullable=True),
59 |         schema=project_name,
60 |     )
61 | 
62 | 
63 | def downgrade() -> None:
64 |     # Remove the limits_overrides column
65 |     op.drop_column("users", "limits_overrides", schema=project_name)
66 | 


--------------------------------------------------------------------------------
/py/r2r/__init__.py:
--------------------------------------------------------------------------------
 1 | from importlib import metadata
 2 | 
 3 | from sdk.async_client import R2RAsyncClient
 4 | from sdk.sync_client import R2RClient
 5 | from shared import *
 6 | from shared import __all__ as shared_all
 7 | 
 8 | __version__ = metadata.version("r2r")
 9 | 
10 | __all__ = [
11 |     "R2RAsyncClient",
12 |     "R2RClient",
13 |     "__version__",
14 |     "R2RException",
15 | ] + shared_all
16 | 
17 | 
18 | def get_version():
19 |     return __version__
20 | 


--------------------------------------------------------------------------------
/py/sdk/README.md:
--------------------------------------------------------------------------------
 1 | # R2R Python SDK Documentation
 2 | 
 3 | For the complete look at the R2R Python SDK, [visit our documentation.](https://r2r-docs.sciphi.ai/documentation/python-sdk/introduction)
 4 | 
 5 | ## Installation
 6 | 
 7 | Before starting, make sure you have completed the [R2R installation](/documentation/installation).
 8 | 
 9 | Install the R2R Python SDK:
10 | 
11 | ```bash
12 | pip install r2r
13 | ```
14 | 
15 | ## Getting Started
16 | 
17 | 1. Import the R2R client:
18 | 
19 | ```python
20 | from r2r import R2RClient
21 | ```
22 | 
23 | 2. Initialize the client:
24 | 
25 | ```python
26 | client = R2RClient("http://localhost:7272")
27 | ```
28 | 
29 | 
30 | 3. Check if R2R is running correctly:
31 | 
32 | ```python
33 | health_response = client.health()
34 | # {"status":"ok"}
35 | ```
36 | 
37 | 4. Login (Optional):
38 | ```python
39 | client.register("me@email.com", "my_password")
40 | # client.verify_email("me@email.com", "my_verification_code")
41 | client.login("me@email.com", "my_password")
42 | ```
43 | When using authentication the commands below automatically restrict the scope to a user's available documents.
44 | 


--------------------------------------------------------------------------------
/py/sdk/__init__.py:
--------------------------------------------------------------------------------
1 | from .async_client import R2RAsyncClient
2 | from .sync_client import R2RClient
3 | 
4 | __all__ = ["R2RAsyncClient", "R2RClient"]
5 | 


--------------------------------------------------------------------------------
/py/sdk/asnyc_methods/__init__.py:
--------------------------------------------------------------------------------
 1 | from .chunks import ChunksSDK
 2 | from .collections import CollectionsSDK
 3 | from .conversations import ConversationsSDK
 4 | from .documents import DocumentsSDK
 5 | from .graphs import GraphsSDK
 6 | from .indices import IndicesSDK
 7 | from .prompts import PromptsSDK
 8 | from .retrieval import RetrievalSDK
 9 | from .system import SystemSDK
10 | from .users import UsersSDK
11 | 
12 | __all__ = [
13 |     "ChunksSDK",
14 |     "CollectionsSDK",
15 |     "ConversationsSDK",
16 |     "DocumentsSDK",
17 |     "GraphsSDK",
18 |     "IndicesSDK",
19 |     "PromptsSDK",
20 |     "RetrievalSDK",
21 |     "SystemSDK",
22 |     "UsersSDK",
23 | ]
24 | 


--------------------------------------------------------------------------------
/py/sdk/asnyc_methods/system.py:
--------------------------------------------------------------------------------
 1 | from shared.api.models import (
 2 |     WrappedGenericMessageResponse,
 3 |     WrappedServerStatsResponse,
 4 |     WrappedSettingsResponse,
 5 | )
 6 | 
 7 | 
 8 | class SystemSDK:
 9 |     def __init__(self, client):
10 |         self.client = client
11 | 
12 |     async def health(self) -> WrappedGenericMessageResponse:
13 |         """Check the health of the R2R server."""
14 |         response_dict = await self.client._make_request(
15 |             "GET", "health", version="v3"
16 |         )
17 | 
18 |         return WrappedGenericMessageResponse(**response_dict)
19 | 
20 |     async def settings(self) -> WrappedSettingsResponse:
21 |         """Get the configuration settings for the R2R server.
22 | 
23 |         Returns:
24 |             dict: The server settings.
25 |         """
26 |         response_dict = await self.client._make_request(
27 |             "GET", "system/settings", version="v3"
28 |         )
29 | 
30 |         return WrappedSettingsResponse(**response_dict)
31 | 
32 |     async def status(self) -> WrappedServerStatsResponse:
33 |         """Get statistics about the server, including the start time, uptime,
34 |         CPU usage, and memory usage.
35 | 
36 |         Returns:
37 |             dict: The server statistics.
38 |         """
39 |         response_dict = await self.client._make_request(
40 |             "GET", "system/status", version="v3"
41 |         )
42 | 
43 |         return WrappedServerStatsResponse(**response_dict)
44 | 


--------------------------------------------------------------------------------
/py/sdk/base/__init_.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/sdk/base/__init_.py


--------------------------------------------------------------------------------
/py/sdk/base/base_client.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from shared.abstractions import R2RClientException
 4 | 
 5 | 
 6 | class BaseClient:
 7 |     def __init__(
 8 |         self,
 9 |         base_url: str | None = None,
10 |         timeout: float = 300.0,
11 |     ):
12 |         self.base_url = base_url or os.getenv(
13 |             "R2R_API_BASE", "http://localhost:7272"
14 |         )
15 |         self.timeout = timeout
16 |         self.access_token: str | None = None
17 |         self._refresh_token: str | None = None
18 |         self._user_id: str | None = None
19 |         self.api_key: str | None = os.getenv("R2R_API_KEY", None)
20 |         self.project_name: str | None = None
21 | 
22 |     def _get_auth_header(self) -> dict[str, str]:
23 |         if self.access_token and self.api_key:
24 |             raise R2RClientException(
25 |                 message="Cannot have both access token and api key.",
26 |             )
27 |         if self.access_token:
28 |             return {"Authorization": f"Bearer {self.access_token}"}
29 |         elif self.api_key:
30 |             return {"x-api-key": self.api_key}
31 |         else:
32 |             return {}
33 | 
34 |     def _get_full_url(self, endpoint: str, version: str = "v3") -> str:
35 |         return f"{self.base_url}/{version}/{endpoint}"
36 | 
37 |     def _prepare_request_args(self, endpoint: str, **kwargs) -> dict:
38 |         headers = kwargs.pop("headers", {})
39 |         if (self.access_token or self.api_key) and endpoint not in [
40 |             "register",
41 |             "login",
42 |             "verify_email",
43 |         ]:
44 |             headers.update(self._get_auth_header())
45 | 
46 |         if self.project_name:
47 |             headers["x-project-name"] = self.project_name
48 | 
49 |         if (
50 |             kwargs.get("params", None) == {}
51 |             or kwargs.get("params", None) is None
52 |         ):
53 |             kwargs.pop("params", None)
54 | 
55 |         return {"headers": headers, **kwargs}
56 | 


--------------------------------------------------------------------------------
/py/sdk/models.py:
--------------------------------------------------------------------------------
  1 | from shared.abstractions import (
  2 |     AggregateSearchResult,
  3 |     ChunkSearchResult,
  4 |     GenerationConfig,
  5 |     GraphCommunityResult,
  6 |     GraphEntityResult,
  7 |     GraphRelationshipResult,
  8 |     GraphSearchResult,
  9 |     GraphSearchResultType,
 10 |     GraphSearchSettings,
 11 |     HybridSearchSettings,
 12 |     IngestionMode,
 13 |     Message,
 14 |     MessageType,
 15 |     R2RException,
 16 |     R2RSerializable,
 17 |     SearchMode,
 18 |     SearchSettings,
 19 |     Token,
 20 |     User,
 21 |     select_search_filters,
 22 | )
 23 | from shared.abstractions.graph import (
 24 |     GraphCreationSettings,
 25 |     GraphEnrichmentSettings,
 26 | )
 27 | from shared.api.models import (
 28 |     AgentEvent,
 29 |     AgentResponse,
 30 |     Citation,
 31 |     CitationData,
 32 |     CitationEvent,
 33 |     Delta,
 34 |     DeltaPayload,
 35 |     FinalAnswerData,
 36 |     FinalAnswerEvent,
 37 |     MessageData,
 38 |     MessageDelta,
 39 |     MessageEvent,
 40 |     RAGResponse,
 41 |     SearchResultsData,
 42 |     SearchResultsEvent,
 43 |     SSEEventBase,
 44 |     ThinkingData,
 45 |     ThinkingEvent,
 46 |     ToolCallData,
 47 |     ToolCallEvent,
 48 |     ToolResultData,
 49 |     ToolResultEvent,
 50 |     UnknownEvent,
 51 | )
 52 | 
 53 | __all__ = [
 54 |     "AggregateSearchResult",
 55 |     "GenerationConfig",
 56 |     "HybridSearchSettings",
 57 |     "GraphCommunityResult",
 58 |     "GraphCreationSettings",
 59 |     "GraphEnrichmentSettings",
 60 |     "GraphEntityResult",
 61 |     "GraphRelationshipResult",
 62 |     "GraphSearchResult",
 63 |     "GraphSearchResultType",
 64 |     "GraphSearchSettings",
 65 |     "Message",
 66 |     "MessageType",
 67 |     "R2RException",
 68 |     "R2RSerializable",
 69 |     "Token",
 70 |     "ChunkSearchResult",
 71 |     "SearchSettings",
 72 |     "select_search_filters",
 73 |     "IngestionMode",
 74 |     "SearchMode",
 75 |     # "RAGResponse",
 76 |     "Citation",
 77 |     "RAGResponse",
 78 |     "AgentEvent",
 79 |     "AgentResponse",
 80 |     "SSEEventBase",
 81 |     "SearchResultsData",
 82 |     "SearchResultsEvent",
 83 |     "MessageData",
 84 |     "MessageDelta",
 85 |     "MessageEvent",
 86 |     "DeltaPayload",
 87 |     "Delta",
 88 |     "CitationData",
 89 |     "CitationEvent",
 90 |     "FinalAnswerData",
 91 |     "FinalAnswerEvent",
 92 |     "ToolCallData",
 93 |     "ToolCallEvent",
 94 |     "ToolResultData",
 95 |     "ToolResultEvent",
 96 |     "ThinkingEvent",
 97 |     "ThinkingData",
 98 |     "UnknownEvent",
 99 |     "User",
100 | ]
101 | 


--------------------------------------------------------------------------------
/py/sdk/sync_methods/__init__.py:
--------------------------------------------------------------------------------
 1 | from .chunks import ChunksSDK
 2 | from .collections import CollectionsSDK
 3 | from .conversations import ConversationsSDK
 4 | from .documents import DocumentsSDK
 5 | from .graphs import GraphsSDK
 6 | from .indices import IndicesSDK
 7 | from .prompts import PromptsSDK
 8 | from .retrieval import RetrievalSDK
 9 | from .system import SystemSDK
10 | from .users import UsersSDK
11 | 
12 | __all__ = [
13 |     "ChunksSDK",
14 |     "CollectionsSDK",
15 |     "ConversationsSDK",
16 |     "DocumentsSDK",
17 |     "GraphsSDK",
18 |     "IndicesSDK",
19 |     "PromptsSDK",
20 |     "RetrievalSDK",
21 |     "SystemSDK",
22 |     "UsersSDK",
23 | ]
24 | 


--------------------------------------------------------------------------------
/py/sdk/sync_methods/system.py:
--------------------------------------------------------------------------------
 1 | from shared.api.models import (
 2 |     WrappedGenericMessageResponse,
 3 |     WrappedServerStatsResponse,
 4 |     WrappedSettingsResponse,
 5 | )
 6 | 
 7 | 
 8 | class SystemSDK:
 9 |     def __init__(self, client):
10 |         self.client = client
11 | 
12 |     def health(self) -> WrappedGenericMessageResponse:
13 |         """Check the health of the R2R server."""
14 |         response_dict = self.client._make_request(
15 |             "GET", "health", version="v3"
16 |         )
17 | 
18 |         return WrappedGenericMessageResponse(**response_dict)
19 | 
20 |     def settings(self) -> WrappedSettingsResponse:
21 |         """Get the configuration settings for the R2R server.
22 | 
23 |         Returns:
24 |             dict: The server settings.
25 |         """
26 |         response_dict = self.client._make_request(
27 |             "GET", "system/settings", version="v3"
28 |         )
29 | 
30 |         return WrappedSettingsResponse(**response_dict)
31 | 
32 |     def status(self) -> WrappedServerStatsResponse:
33 |         """Get statistics about the server, including the start time, uptime,
34 |         CPU usage, and memory usage.
35 | 
36 |         Returns:
37 |             dict: The server statistics.
38 |         """
39 |         response_dict = self.client._make_request(
40 |             "GET", "system/status", version="v3"
41 |         )
42 | 
43 |         return WrappedServerStatsResponse(**response_dict)
44 | 


--------------------------------------------------------------------------------
/py/shared/__init__.py:
--------------------------------------------------------------------------------
1 | from .abstractions import *
2 | from .abstractions import __all__ as abstractions_all
3 | from .api.models import *
4 | from .api.models import __all__ as api_models_all
5 | from .utils import *
6 | 
7 | __all__ = abstractions_all + api_models_all
8 | 


--------------------------------------------------------------------------------
/py/shared/abstractions/prompt.py:
--------------------------------------------------------------------------------
 1 | """Abstraction for a prompt that can be formatted with inputs."""
 2 | 
 3 | import logging
 4 | from datetime import datetime
 5 | from typing import Any
 6 | from uuid import UUID, uuid4
 7 | 
 8 | from pydantic import BaseModel, Field
 9 | 
10 | logger = logging.getLogger()
11 | 
12 | 
13 | class Prompt(BaseModel):
14 |     """A prompt that can be formatted with inputs."""
15 | 
16 |     id: UUID = Field(default_factory=uuid4)
17 |     name: str
18 |     template: str
19 |     input_types: dict[str, str]
20 |     created_at: datetime = Field(default_factory=datetime.utcnow)
21 |     updated_at: datetime = Field(default_factory=datetime.utcnow)
22 | 
23 |     def format_prompt(self, inputs: dict[str, Any]) -> str:
24 |         self._validate_inputs(inputs)
25 |         return self.template.format(**inputs)
26 | 
27 |     def _validate_inputs(self, inputs: dict[str, Any]) -> None:
28 |         for var, expected_type_name in self.input_types.items():
29 |             expected_type = self._convert_type(expected_type_name)
30 |             if var not in inputs:
31 |                 raise ValueError(f"Missing input: {var}")
32 |             if not isinstance(inputs[var], expected_type):
33 |                 raise TypeError(
34 |                     f"Input '{var}' must be of type {expected_type.__name__}, got {type(inputs[var]).__name__} instead."
35 |                 )
36 | 
37 |     def _convert_type(self, type_name: str) -> type:
38 |         type_mapping = {"int": int, "str": str}
39 |         return type_mapping.get(type_name, str)
40 | 


--------------------------------------------------------------------------------
/py/shared/abstractions/tool.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Callable, Optional
 2 | 
 3 | from ..abstractions import R2RSerializable
 4 | 
 5 | 
 6 | class Tool(R2RSerializable):
 7 |     name: str
 8 |     description: str
 9 |     results_function: Callable
10 |     llm_format_function: Optional[Callable] = None
11 |     stream_function: Optional[Callable] = None
12 |     parameters: Optional[dict[str, Any]] = None
13 |     context: Optional[Any] = None
14 | 
15 |     class Config:
16 |         populate_by_name = True
17 |         arbitrary_types_allowed = True
18 | 
19 |     def set_context(self, context: Any) -> None:
20 |         """Set the context for this tool."""
21 |         self.context = context
22 | 
23 |     async def execute(self, *args, **kwargs):
24 |         """
25 |         Execute the tool with context awareness.
26 |         This wraps the results_function to ensure context is available.
27 |         """
28 |         if self.context is None:
29 |             raise ValueError(
30 |                 f"Tool '{self.name}' requires context but none was provided"
31 |             )
32 | 
33 |         # Call the actual implementation with context
34 |         return await self.results_function(context=self.context, **kwargs)
35 | 
36 | 
37 | class ToolResult(R2RSerializable):
38 |     raw_result: Any
39 |     llm_formatted_result: str
40 |     stream_result: Optional[str] = None
41 | 


--------------------------------------------------------------------------------
/py/shared/abstractions/user.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from typing import Optional
 3 | from uuid import UUID
 4 | 
 5 | from pydantic import BaseModel, Field
 6 | 
 7 | from shared.abstractions import R2RSerializable
 8 | 
 9 | from ..utils import generate_default_user_collection_id
10 | 
11 | 
12 | class Collection(BaseModel):
13 |     id: UUID
14 |     name: str
15 |     description: Optional[str] = None
16 |     created_at: datetime = Field(
17 |         default_factory=datetime.utcnow,
18 |     )
19 |     updated_at: datetime = Field(
20 |         default_factory=datetime.utcnow,
21 |     )
22 | 
23 |     class Config:
24 |         populate_by_name = True
25 |         from_attributes = True
26 | 
27 |     def __init__(self, **data):
28 |         super().__init__(**data)
29 |         if self.id is None:
30 |             self.id = generate_default_user_collection_id(self.name)
31 | 
32 | 
33 | class Token(BaseModel):
34 |     token: str
35 |     token_type: str
36 | 
37 | 
38 | class TokenData(BaseModel):
39 |     email: str
40 |     token_type: str
41 |     exp: datetime
42 | 
43 | 
44 | class User(R2RSerializable):
45 |     id: UUID
46 |     email: str
47 |     is_active: bool = True
48 |     is_superuser: bool = False
49 |     created_at: datetime = datetime.now()
50 |     updated_at: datetime = datetime.now()
51 |     is_verified: bool = False
52 |     collection_ids: list[UUID] = []
53 |     graph_ids: list[UUID] = []
54 |     document_ids: list[UUID] = []
55 | 
56 |     # Optional fields (to update or set at creation)
57 |     limits_overrides: Optional[dict] = None
58 |     metadata: Optional[dict] = None
59 |     verification_code_expiry: Optional[datetime] = None
60 |     name: Optional[str] = None
61 |     bio: Optional[str] = None
62 |     profile_picture: Optional[str] = None
63 |     total_size_in_bytes: Optional[int] = None
64 |     num_files: Optional[int] = None
65 | 
66 |     account_type: str = "password"
67 |     hashed_password: Optional[str] = None
68 |     google_id: Optional[str] = None
69 |     github_id: Optional[str] = None
70 | 


--------------------------------------------------------------------------------
/py/shared/api/models/auth/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/auth/__init__.py


--------------------------------------------------------------------------------
/py/shared/api/models/auth/responses.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | from shared.abstractions import Token
 4 | from shared.api.models.base import R2RResults
 5 | 
 6 | 
 7 | class TokenResponse(BaseModel):
 8 |     access_token: Token
 9 |     refresh_token: Token
10 | 
11 | 
12 | # Create wrapped versions of each response
13 | WrappedTokenResponse = R2RResults[TokenResponse]
14 | 


--------------------------------------------------------------------------------
/py/shared/api/models/base.py:
--------------------------------------------------------------------------------
 1 | from typing import Generic, TypeVar
 2 | 
 3 | from pydantic import BaseModel
 4 | 
 5 | T = TypeVar("T")
 6 | 
 7 | 
 8 | class R2RResults(BaseModel, Generic[T]):
 9 |     results: T
10 | 
11 | 
12 | class PaginatedR2RResult(BaseModel, Generic[T]):
13 |     results: T
14 |     total_entries: int
15 | 
16 | 
17 | class GenericBooleanResponse(BaseModel):
18 |     success: bool
19 | 
20 | 
21 | class GenericMessageResponse(BaseModel):
22 |     message: str
23 | 
24 | 
25 | WrappedBooleanResponse = R2RResults[GenericBooleanResponse]
26 | WrappedGenericMessageResponse = R2RResults[GenericMessageResponse]
27 | 


--------------------------------------------------------------------------------
/py/shared/api/models/graph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/graph/__init__.py


--------------------------------------------------------------------------------
/py/shared/api/models/graph/responses.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from typing import Optional
 3 | from uuid import UUID
 4 | 
 5 | from pydantic import BaseModel
 6 | 
 7 | from shared.abstractions.graph import Community, Entity, Relationship
 8 | from shared.api.models.base import PaginatedR2RResult, R2RResults
 9 | 
10 | WrappedEntityResponse = R2RResults[Entity]
11 | WrappedEntitiesResponse = PaginatedR2RResult[list[Entity]]
12 | WrappedRelationshipResponse = R2RResults[Relationship]
13 | WrappedRelationshipsResponse = PaginatedR2RResult[list[Relationship]]
14 | WrappedCommunityResponse = R2RResults[Community]
15 | WrappedCommunitiesResponse = PaginatedR2RResult[list[Community]]
16 | 
17 | 
18 | class GraphResponse(BaseModel):
19 |     id: UUID
20 |     collection_id: UUID
21 |     name: str
22 |     description: Optional[str]
23 |     status: str
24 |     created_at: datetime
25 |     updated_at: datetime
26 |     document_ids: list[UUID]
27 | 
28 | 
29 | # Graph Responses
30 | WrappedGraphResponse = R2RResults[GraphResponse]
31 | WrappedGraphsResponse = PaginatedR2RResult[list[GraphResponse]]
32 | 


--------------------------------------------------------------------------------
/py/shared/api/models/ingestion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/ingestion/__init__.py


--------------------------------------------------------------------------------
/py/shared/api/models/ingestion/responses.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional, TypeVar
 2 | from uuid import UUID
 3 | 
 4 | from pydantic import BaseModel, Field
 5 | 
 6 | from shared.api.models.base import PaginatedR2RResult, R2RResults
 7 | 
 8 | T = TypeVar("T")
 9 | 
10 | 
11 | class IngestionResponse(BaseModel):
12 |     message: str = Field(
13 |         ...,
14 |         description="A message describing the result of the ingestion request.",
15 |     )
16 |     task_id: Optional[UUID] = Field(
17 |         None,
18 |         description="The task ID of the ingestion request.",
19 |     )
20 |     document_id: UUID = Field(
21 |         ...,
22 |         description="The ID of the document that was ingested.",
23 |     )
24 | 
25 |     class Config:
26 |         json_schema_extra = {
27 |             "example": {
28 |                 "message": "Ingestion task queued successfully.",
29 |                 "task_id": "c68dc72e-fc23-5452-8f49-d7bd46088a96",
30 |                 "document_id": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
31 |             }
32 |         }
33 | 
34 | 
35 | class UpdateResponse(BaseModel):
36 |     message: str = Field(
37 |         ...,
38 |         description="A message describing the result of the ingestion request.",
39 |     )
40 |     task_id: Optional[UUID] = Field(
41 |         None,
42 |         description="The task ID of the ingestion request.",
43 |     )
44 |     document_ids: list[UUID] = Field(
45 |         ...,
46 |         description="The ID of the document that was ingested.",
47 |     )
48 | 
49 |     class Config:
50 |         json_schema_extra = {
51 |             "example": {
52 |                 "message": "Update task queued successfully.",
53 |                 "task_id": "c68dc72e-fc23-5452-8f49-d7bd46088a96",
54 |                 "document_ids": ["9fbe403b-c11c-5aae-8ade-ef22980c3ad1"],
55 |             }
56 |         }
57 | 
58 | 
59 | class VectorIndexResponse(BaseModel):
60 |     index: dict[str, Any]
61 | 
62 | 
63 | class VectorIndicesResponse(BaseModel):
64 |     indices: list[VectorIndexResponse]
65 | 
66 | 
67 | WrappedIngestionResponse = R2RResults[IngestionResponse]
68 | WrappedMetadataUpdateResponse = R2RResults[IngestionResponse]
69 | WrappedUpdateResponse = R2RResults[UpdateResponse]
70 | 
71 | WrappedVectorIndexResponse = R2RResults[VectorIndexResponse]
72 | WrappedVectorIndicesResponse = PaginatedR2RResult[VectorIndicesResponse]
73 | 


--------------------------------------------------------------------------------
/py/shared/api/models/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/management/__init__.py


--------------------------------------------------------------------------------
/py/shared/api/models/retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/shared/api/models/retrieval/__init__.py


--------------------------------------------------------------------------------
/py/shared/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base_utils import (
 2 |     _decorate_vector_type,
 3 |     _get_vector_column_str,
 4 |     deep_update,
 5 |     dump_collector,
 6 |     dump_obj,
 7 |     format_search_results_for_llm,
 8 |     generate_default_prompt_id,
 9 |     generate_default_user_collection_id,
10 |     generate_document_id,
11 |     generate_entity_document_id,
12 |     generate_extraction_id,
13 |     generate_id,
14 |     generate_user_id,
15 |     validate_uuid,
16 |     yield_sse_event,
17 | )
18 | from .splitter.text import RecursiveCharacterTextSplitter, TextSplitter
19 | 
20 | __all__ = [
21 |     "format_search_results_for_llm",
22 |     # ID generation
23 |     "generate_id",
24 |     "generate_document_id",
25 |     "generate_extraction_id",
26 |     "generate_default_user_collection_id",
27 |     "generate_user_id",
28 |     "generate_default_prompt_id",
29 |     "generate_entity_document_id",
30 |     # Other
31 |     "validate_uuid",
32 |     "deep_update",
33 |     # Text splitter
34 |     "RecursiveCharacterTextSplitter",
35 |     "TextSplitter",
36 |     # Vector utils
37 |     "_decorate_vector_type",
38 |     "_get_vector_column_str",
39 |     "yield_sse_event",
40 |     "dump_collector",
41 |     "dump_obj",
42 | ]
43 | 


--------------------------------------------------------------------------------
/py/shared/utils/splitter/__init__.py:
--------------------------------------------------------------------------------
1 | from .text import RecursiveCharacterTextSplitter
2 | 
3 | __all__ = ["RecursiveCharacterTextSplitter"]
4 | 


--------------------------------------------------------------------------------
/py/tests/integration/test_base.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from r2r import R2RException
 4 | 
 5 | 
 6 | class BaseTest:
 7 |     """Base class for all test classes with common utilities."""
 8 | 
 9 |     @staticmethod
10 |     async def cleanup_resource(cleanup_func,
11 |                                resource_id: Optional[str] = None) -> None:
12 |         """Generic cleanup helper that won't fail the test if cleanup fails."""
13 |         if resource_id:
14 |             try:
15 |                 await cleanup_func(id=resource_id)
16 |             except R2RException:
17 |                 pass
18 | 


--------------------------------------------------------------------------------
/py/tests/scaling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/tests/scaling/__init__.py


--------------------------------------------------------------------------------
/py/tests/unit/retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/py/tests/unit/retrieval/__init__.py


--------------------------------------------------------------------------------
/services/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/services/README.md


--------------------------------------------------------------------------------
/services/clustering/Dockerfile.clustering:
--------------------------------------------------------------------------------
 1 | FROM python:3.12-slim AS builder
 2 | 
 3 | # Install system dependencies
 4 | RUN apt-get update && apt-get install -y --no-install-recommends \
 5 |     gcc g++ musl-dev curl libffi-dev \
 6 |     && apt-get clean && rm -rf /var/lib/apt/lists/* \
 7 |     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 8 | 
 9 | RUN pip install --no-cache-dir poetry
10 | 
11 | # Add Rust to PATH
12 | ENV PATH="/root/.cargo/bin:${PATH}"
13 | 
14 | ENV PYTHONDONTWRITEBYTECODE=1
15 | ENV PYTHONUNBUFFERED=1
16 | 
17 | WORKDIR /app
18 | 
19 | # Install graspologic and other dependencies
20 | RUN pip install --no-cache-dir fastapi uvicorn networkx "graspologic[leiden]" future pydantic==2.8.2
21 | 
22 | COPY main.py .
23 | 
24 | EXPOSE 7276
25 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7276"]
26 | 


--------------------------------------------------------------------------------
/services/unstructured/Dockerfile.unstructured:
--------------------------------------------------------------------------------
 1 | FROM python:3.12-slim AS builder
 2 | 
 3 | # Install system dependencies (including those needed for Unstructured and OpenCV)
 4 | RUN apt-get update && apt-get install -y --no-install-recommends \
 5 |     gcc g++ musl-dev curl libffi-dev gfortran libopenblas-dev \
 6 |     tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
 7 |     poppler-utils libmagic1 pandoc libreoffice \
 8 |     libgl1-mesa-glx libglib2.0-0 \
 9 |     && apt-get clean && rm -rf /var/lib/apt/lists/*
10 | 
11 | ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
12 | 
13 | ENV PYTHONDONTWRITEBYTECODE=1
14 | ENV PYTHONUNBUFFERED=1
15 | 
16 | WORKDIR /app
17 | 
18 | RUN pip install --no-cache-dir unstructured "unstructured[all-docs]"
19 | 
20 | 
21 | ENV NLTK_DATA=/usr/share/nltk_data
22 | RUN mkdir -p ${NLTK_DATA}
23 | RUN python -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng
24 | 
25 | RUN python -c "from unstructured.partition.model_init import initialize; initialize()"
26 | 
27 | RUN pip install gunicorn uvicorn fastapi httpx
28 | 
29 | COPY main.py .
30 | 
31 | EXPOSE 7275
32 | 
33 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7275"]
34 | 


--------------------------------------------------------------------------------
/services/unstructured/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciPhi-AI/R2R/d98d1cf4a9fd838b36072ebd45020a911e70493f/services/unstructured/README.md


--------------------------------------------------------------------------------
/services/unstructured/main.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import base64
 3 | import concurrent.futures
 4 | import logging
 5 | import os
 6 | from io import BytesIO
 7 | from typing import Optional
 8 | 
 9 | from fastapi import FastAPI, HTTPException
10 | from pydantic import BaseModel
11 | from unstructured.partition.auto import partition
12 | 
13 | logger = logging.getLogger()
14 | 
15 | app = FastAPI()
16 | 
17 | 
18 | class PartitionRequestModel(BaseModel):
19 |     file_content: bytes
20 |     ingestion_config: dict
21 |     filename: Optional[str] = None
22 | 
23 | 
24 | class PartitionResponseModel(BaseModel):
25 |     elements: list[dict]
26 | 
27 | 
28 | executor = concurrent.futures.ThreadPoolExecutor(
29 |     max_workers=int(os.environ.get("MAX_INGESTION_WORKERS", 10))
30 | )
31 | 
32 | 
33 | def run_partition(file_content: str, filename: str, ingestion_config: dict) -> list[dict]:
34 |     file_content_bytes = base64.b64decode(file_content)
35 |     file_io = BytesIO(file_content_bytes)
36 |     elements = partition(file=file_io, file_filename=filename, **ingestion_config)
37 |     return [element.to_dict() for element in elements]
38 | 
39 | 
40 | @app.get("/health")
41 | async def health_endpoint():
42 |     return {"status": "ok"}
43 | 
44 | 
45 | @app.post("/partition", response_model=PartitionResponseModel)
46 | async def partition_endpoint(request: PartitionRequestModel):
47 |     try:
48 |         logger.info(f"Partitioning request received: {request}")
49 |         loop = asyncio.get_event_loop()
50 |         elements = await loop.run_in_executor(
51 |             executor,
52 |             run_partition,
53 |             request.file_content,
54 |             request.filename,
55 |             request.ingestion_config,
56 |         )
57 |         logger.info("Partitioning completed")
58 |         return PartitionResponseModel(elements=elements)
59 |     except Exception as e:
60 |         logger.error(f"Error partitioning file: {str(e)}")
61 |         raise HTTPException(status_code=500, detail=str(e))
62 | 


--------------------------------------------------------------------------------