├── .flake8
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── EXTERNAL_ISSUE_TEMPLATE.yml
│   │   ├── EXTERNAL_USER_FEATURE_REQUEST.yml
│   │   ├── INTERNAL_EPIC_TEMPLATE.yml
│   │   ├── INTERNAL_USER_STORY_TEMPLATE.yml
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows
│       ├── backend-core-tests.yml
│       ├── conventional-pr-title.yml
│       ├── release-please-core.yml
│       └── stale.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── .readthedocs.yaml
├── .release-please-manifest.json
├── .vscode
│   ├── extensions.json
│   ├── launch.json
│   └── settings.json
├── CHANGELOG.md
├── LICENSE
├── README.md
├── core
│   ├── .flake8
│   ├── .gitignore
│   ├── .python-version
│   ├── CHANGELOG.md
│   ├── Dockerfile.test
│   ├── README.md
│   ├── example_workflows
│   │   └── talk_to_file_rag_config_workflow.yaml
│   ├── pyproject.toml
│   ├── quivr_core
│   │   ├── __init__.py
│   │   ├── base_config.py
│   │   ├── brain
│   │   │   ├── __init__.py
│   │   │   ├── brain.py
│   │   │   ├── brain_defaults.py
│   │   │   ├── info.py
│   │   │   └── serialization.py
│   │   ├── config.py
│   │   ├── files
│   │   │   ├── __init__.py
│   │   │   └── file.py
│   │   ├── language
│   │   │   ├── models.py
│   │   │   └── utils.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   └── llm_endpoint.py
│   │   ├── llm_tools
│   │   │   ├── __init__.py
│   │   │   ├── entity.py
│   │   │   ├── llm_tools.py
│   │   │   ├── other_tools.py
│   │   │   └── web_search_tools.py
│   │   ├── processor
│   │   │   ├── __init__.py
│   │   │   ├── implementations
│   │   │   │   ├── __init__.py
│   │   │   │   ├── default.py
│   │   │   │   ├── megaparse_processor.py
│   │   │   │   ├── simple_txt_processor.py
│   │   │   │   └── tika_processor.py
│   │   │   ├── processor_base.py
│   │   │   ├── registry.py
│   │   │   └── splitter.py
│   │   ├── rag
│   │   │   ├── __init__.py
│   │   │   ├── entities
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat.py
│   │   │   │   ├── config.py
│   │   │   │   └── models.py
│   │   │   ├── prompts.py
│   │   │   ├── quivr_rag.py
│   │   │   ├── quivr_rag_langgraph.py
│   │   │   └── utils.py
│   │   └── storage
│   │       ├── __init__.py
│   │       ├── file.py
│   │       ├── local_storage.py
│   │       └── storage_base.py
│   ├── requirements-dev.lock
│   ├── requirements.lock
│   ├── scripts
│   │   ├── run_tests.sh
│   │   └── run_tests_buildx.sh
│   ├── tests
│   │   ├── __init__.py
│   │   ├── chunk_stream_fixture.jsonl
│   │   ├── conftest.py
│   │   ├── fixture_chunks.py
│   │   ├── processor
│   │   │   ├── __init__.py
│   │   │   ├── community
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_markdown_processor.py
│   │   │   ├── data
│   │   │   │   ├── dummy.pdf
│   │   │   │   └── guidelines_code.md
│   │   │   ├── docx
│   │   │   │   ├── __init__.py
│   │   │   │   ├── demo.docx
│   │   │   │   └── test_docx.py
│   │   │   ├── epub
│   │   │   │   ├── __init__.py
│   │   │   │   ├── page-blanche.epub
│   │   │   │   ├── sway.epub
│   │   │   │   └── test_epub_processor.py
│   │   │   ├── odt
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bad_odt.odt
│   │   │   │   ├── sample.odt
│   │   │   │   └── test_odt.py
│   │   │   ├── pdf
│   │   │   │   ├── __init__.py
│   │   │   │   ├── sample.pdf
│   │   │   │   └── test_unstructured_pdf_processor.py
│   │   │   ├── test_default_implementations.py
│   │   │   ├── test_registry.py
│   │   │   ├── test_simple_txt_processor.py
│   │   │   ├── test_tika_processor.py
│   │   │   └── test_txt_processor.py
│   │   ├── rag_config.yaml
│   │   ├── rag_config_workflow.yaml
│   │   ├── test_brain.py
│   │   ├── test_chat_history.py
│   │   ├── test_config.py
│   │   ├── test_llm_endpoint.py
│   │   ├── test_quivr_file.py
│   │   ├── test_quivr_rag.py
│   │   └── test_utils.py
│   └── tox.ini
├── docs
│   ├── .gitignore
│   ├── .python-version
│   ├── README.md
│   ├── docs
│   │   ├── brain
│   │   │   ├── brain.md
│   │   │   ├── chat.md
│   │   │   └── index.md
│   │   ├── config
│   │   │   ├── base_config.md
│   │   │   ├── config.md
│   │   │   └── index.md
│   │   ├── css
│   │   │   └── style.css
│   │   ├── examples
│   │   │   ├── assets
│   │   │   │   └── chatbot_voice_flask.mp4
│   │   │   ├── chatbot.md
│   │   │   ├── chatbot_voice.md
│   │   │   ├── chatbot_voice_flask.md
│   │   │   ├── custom_storage.md
│   │   │   └── index.md
│   │   ├── index.md
│   │   ├── parsers
│   │   │   ├── index.md
│   │   │   ├── megaparse.md
│   │   │   └── simple.md
│   │   ├── quickstart.md
│   │   ├── storage
│   │   │   ├── base.md
│   │   │   ├── index.md
│   │   │   └── local_storage.md
│   │   ├── vectorstores
│   │   │   ├── faiss.md
│   │   │   ├── index.md
│   │   │   └── pgvector.md
│   │   └── workflows
│   │       ├── examples
│   │       │   ├── basic_ingestion.excalidraw.png
│   │       │   ├── basic_ingestion.md
│   │       │   ├── basic_rag.excalidraw.png
│   │       │   ├── basic_rag.md
│   │       │   ├── rag_with_web_search.excalidraw.png
│   │       │   └── rag_with_web_search.md
│   │       └── index.md
│   ├── mkdocs.yml
│   ├── overrides
│   │   └── empty
│   ├── pyproject.toml
│   ├── requirements-dev.lock
│   ├── requirements.lock
│   └── src
│       └── docs
│           └── __init__.py
├── examples
│   ├── chatbot
│   │   ├── .chainlit
│   │   │   ├── config.toml
│   │   │   └── translations
│   │   │       ├── bn.json
│   │   │       ├── en-US.json
│   │   │       ├── gu.json
│   │   │       ├── he-IL.json
│   │   │       ├── hi.json
│   │   │       ├── kn.json
│   │   │       ├── ml.json
│   │   │       ├── mr.json
│   │   │       ├── ta.json
│   │   │       ├── te.json
│   │   │       └── zh-CN.json
│   │   ├── .gitignore
│   │   ├── .python-version
│   │   ├── README.md
│   │   ├── basic_rag_workflow.yaml
│   │   ├── chainlit.md
│   │   ├── main.py
│   │   ├── public
│   │   │   ├── favicon.ico
│   │   │   ├── logo_dark.png
│   │   │   └── logo_light.png
│   │   ├── pyproject.toml
│   │   ├── requirements-dev.lock
│   │   └── requirements.lock
│   ├── chatbot_voice
│   │   ├── .chainlit
│   │   │   ├── config.toml
│   │   │   └── translations
│   │   │       ├── bn.json
│   │   │       ├── en-US.json
│   │   │       ├── gu.json
│   │   │       ├── he-IL.json
│   │   │       ├── hi.json
│   │   │       ├── kn.json
│   │   │       ├── ml.json
│   │   │       ├── mr.json
│   │   │       ├── ta.json
│   │   │       ├── te.json
│   │   │       └── zh-CN.json
│   │   ├── .gitignore
│   │   ├── .python-version
│   │   ├── README.md
│   │   ├── basic_rag_workflow.yaml
│   │   ├── chainlit.md
│   │   ├── main.py
│   │   ├── public
│   │   │   ├── favicon.ico
│   │   │   ├── logo_dark.png
│   │   │   └── logo_light.png
│   │   ├── pyproject.toml
│   │   ├── requirements-dev.lock
│   │   └── requirements.lock
│   ├── pdf_document_from_yaml.py
│   ├── pdf_parsing_tika.py
│   ├── quivr-whisper
│   │   ├── .env_example
│   │   ├── .gitignore
│   │   ├── .python-version
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── pyproject.toml
│   │   ├── requirements-dev.lock
│   │   ├── requirements.lock
│   │   ├── static
│   │   │   ├── app.js
│   │   │   ├── loader.svg
│   │   │   ├── mic-off.svg
│   │   │   ├── mic.svg
│   │   │   └── styles.css
│   │   └── templates
│   │       └── index.html
│   ├── save_load_brain.py
│   ├── simple_question
│   │   ├── .gitignore
│   │   ├── .python-version
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   ├── requirements-dev.lock
│   │   ├── requirements.lock
│   │   ├── simple_question.py
│   │   └── simple_question_streaming.py
│   └── simple_question_megaparse.py
├── logo.png
├── release-please-config.json
└── vercel.json
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ; Minimal configuration for Flake8 to work with Black.
3 | max-line-length = 100
4 | ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,BLK100
5 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: StanGirard
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: StanGirard
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/EXTERNAL_ISSUE_TEMPLATE.yml:
--------------------------------------------------------------------------------
1 | name: Bug Report 🐛
2 | description: File a bug report
3 | title: "[Bug]: "
4 | labels: ["bug"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Thanks for taking the time to fill out this bug report!
10 | - type: textarea
11 | id: what-happened
12 | attributes:
13 | label: What happened?
14 | description: Also tell us, what did you expect to happen?
15 | placeholder: Tell us what you see!
16 | value: "A bug happened!"
17 | validations:
18 | required: true
19 | - type: textarea
20 | id: logs
21 | attributes:
22 | label: Relevant log output
23 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
24 | render: bash
25 | - type: input
26 | id: contact
27 | attributes:
28 | label: Twitter / LinkedIn details
29 | description: We announce new features on Twitter + LinkedIn. If this issue leads to an announcement, and you'd like a mention, we'll gladly shout you out!
30 | placeholder: ex. @_StanGirard / <>
31 | validations:
32 | required: false
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/EXTERNAL_USER_FEATURE_REQUEST.yml:
--------------------------------------------------------------------------------
1 | name: Feature Request 🚀
2 | description: Submit a proposal/request for a new Quivr feature.
3 | title: "[Feature]: "
4 | labels: ["enhancement"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Thanks for making Quivr better!
10 | - type: textarea
11 | id: the-feature
12 | attributes:
13 | label: The Feature
14 | description: A clear and concise description of the feature proposal
15 | placeholder: Tell us what you want!
16 | validations:
17 | required: true
18 | - type: textarea
19 | id: motivation
20 | attributes:
21 | label: Motivation, pitch
22 | description: Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., "I'm working on X and would like Y to be possible". If this is related to another GitHub issue, please link here too.
23 | validations:
24 | required: true
25 | - type: input
26 | id: contact
27 | attributes:
28 | label: Twitter / LinkedIn details
29 |       description: We announce new features on Twitter + LinkedIn. If this feature is announced and you'd like a mention, we'll gladly shout you out!
30 | placeholder: ex. @StanGirard_ / https://www.linkedin.com/in/stanislas-girard/
31 | validations:
32 | required: false
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/INTERNAL_EPIC_TEMPLATE.yml:
--------------------------------------------------------------------------------
1 | name: QUIVR INTERNAL ONLY - Epic
2 | description: Use this form for epics.
3 | title: "[Epic]: "
4 | labels: ["epic"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | **The Why**
10 |
11 | Explain the reasons for this epic.
12 |
13 | - type: textarea
14 | id: why-value
15 | attributes:
16 | label: Why do we need this functionality?
17 | description: What's the value? For whom?
18 | placeholder: Detail the reasons here...
19 | validations:
20 | required: true
21 |
22 | - type: markdown
23 | attributes:
24 | value: |
25 | **The What**
26 |
27 | - type: textarea
28 | id: developing
29 | attributes:
30 | label: What are we developing?
31 |       description: What’s the expected behaviour? What should it look like?
32 | placeholder: Describe the functionality or feature here...
33 | validations:
34 | required: true
35 |
36 | - type: textarea
37 | id: validation-criteria
38 | attributes:
39 | label: What are the validation criteria to validate this feature?
40 | description: Testing criteria, edge cases, error behavior, etc.
41 | placeholder: Detail the validation criteria here...
42 | validations:
43 | required: true
44 |
45 | - type: textarea
46 | id: out-of-scope
47 | attributes:
48 | label: What's out of scope for this feature?
49 | placeholder: Detail what's not covered by this epic...
50 | validations:
51 | required: true
52 |
53 | - type: checkboxes
54 | id: feature-flagged
55 | attributes:
56 | label: Should this feature be feature flagged?
57 | options:
58 | - label: Feature Flagged
59 | required: false
60 |
61 | - type: markdown
62 | attributes:
63 | value: |
64 | **The How**
65 |
66 | - type: textarea
67 | id: code-strategy
68 | attributes:
69 | label: How are we going to code this feature?
70 | description: Technical strategy, impacts, consequences, etc.
71 | placeholder: Describe the technical approach here...
72 | validations:
73 | required: true
74 |
75 | - type: textarea
76 | id: technical-decisions
77 | attributes:
78 | label: Are there any technical decisions made that should be shared?
79 | placeholder: Detail the decisions here...
80 | validations:
81 | required: true
82 |
83 | - type: textarea
84 | id: blockers
85 | attributes:
86 | label: What is preventing us from delivering this feature?
87 | placeholder: Detail any dependencies, issues, or blockers here...
88 | validations:
89 | required: true
90 |
91 | - type: textarea
92 | id: potential-breaks
93 | attributes:
94 | label: What can the feature potentially break?
95 | placeholder: Detail any potential side effects or issues here...
96 | validations:
97 | required: true
98 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/INTERNAL_USER_STORY_TEMPLATE.yml:
--------------------------------------------------------------------------------
1 | name: QUIVR INTERNAL ONLY - User story
2 | description: Use this form for user stories.
3 | title: "User story:"
4 | labels: ["user story"]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | **Epic**
10 |
11 | Include the issue that represents the epic.
12 |
13 | - type: input
14 | id: epic-link
15 | attributes:
16 | label: Link to the Epic
17 | placeholder: Paste the link to the related epic here...
18 | validations:
19 | required: true
20 |
21 | - type: markdown
22 | attributes:
23 | value: |
24 | **Functional**
25 |
26 | Detail the functionality and provide context and motivation.
27 |
28 | - type: textarea
29 | id: functionality-detail
30 | attributes:
31 | label: Explain the Functionality
32 | placeholder: Detail the user story functionality here...
33 | validations:
34 | required: true
35 |
36 | - type: markdown
37 | attributes:
38 | value: |
39 | **Schema**
40 |
41 | - type: markdown
42 | attributes:
43 | value: |
44 | ### Tech
45 |
46 | - type: markdown
47 | attributes:
48 | value: |
49 | ### Tech To-dos
50 |
51 | - type: textarea
52 | id: tech-todos
53 | attributes:
54 | label: Tech To-dos
55 | placeholder: Detail the tech to-dos here...
56 | validations:
57 | required: true
58 |
59 | - type: markdown
60 | attributes:
61 | value: |
62 | ### Tests
63 |
64 | - type: textarea
65 | id: tests
66 | attributes:
67 | label: Tests
68 | placeholder: Detail the tests here...
69 | validations:
70 | required: true
71 |
72 | - type: markdown
73 | attributes:
74 | value: |
75 | ### Validation Checks
76 |
77 | - type: textarea
78 | id: validation-checks
79 | attributes:
80 | label: Validation Checks
81 | placeholder: Detail the validation checks here...
82 | validations:
83 | required: true
84 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Quivr Community Discord
4 | url: https://discord.gg/HUpRgp2HG8
5 | about: Please ask and answer questions here.
6 | - name: Twitter
7 | url: https://twitter.com/Quivr_brain
8 | about: Follow us on Twitter for updates.
9 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Description
2 |
3 | Please include a summary of the changes and the related issue. Please also include relevant motivation and context.
4 |
5 | ## Checklist before requesting a review
6 |
7 | Please delete options that are not relevant.
8 |
9 | - [ ] My code follows the style guidelines of this project
10 | - [ ] I have performed a self-review of my code
11 | - [ ] I have commented hard-to-understand areas
12 | - [ ] Where possible, I have added tests that prove my fix is effective or that my feature works
13 | - [ ] New and existing unit tests pass locally with my changes
14 | - [ ] Any dependent changes have been merged
15 |
16 | ## Screenshots (if appropriate):
17 |
--------------------------------------------------------------------------------
/.github/workflows/backend-core-tests.yml:
--------------------------------------------------------------------------------
1 | name: Run Tests with Tika Server
2 |
3 | on:
4 | push:
5 | paths:
6 | - "core/**"
7 | pull_request:
8 | paths:
9 | - "core/**"
10 | workflow_dispatch:
11 |
12 | jobs:
13 | test:
14 | runs-on: ubuntu-latest
15 |
16 | services:
17 | tika:
18 | image: apache/tika
19 | ports:
20 | - 9998:9998
21 |
22 | steps:
23 | - name: 👀 Checkout code
24 | uses: actions/checkout@v2
25 |
26 | - name: 🔨 Install the latest version of rye
27 | uses: eifinger/setup-rye@v4
28 | with:
29 | enable-cache: true
30 | working-directory: backend
31 | - name: 🔄 Sync dependencies
32 | run: |
33 | cd core
34 | UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock
35 |
36 | - name: Run tests
37 | env:
38 | TIKA_URL: http://localhost:9998/tika
39 | OPENAI_API_KEY: this-is-a-test-key
40 | run: |
41 | sudo apt-get update
42 | sudo apt-get install -y libmagic-dev poppler-utils libreoffice tesseract-ocr pandoc
43 | cd core
44 | rye test -p quivr-core
45 |
--------------------------------------------------------------------------------
/.github/workflows/conventional-pr-title.yml:
--------------------------------------------------------------------------------
1 | name: "Lint PR"
2 |
3 | on:
4 | pull_request_target:
5 | types:
6 | - opened
7 | - edited
8 | - synchronize
9 |
10 | permissions:
11 | pull-requests: read
12 |
13 | jobs:
14 | main:
15 | name: Validate PR title
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: amannn/action-semantic-pull-request@v5
19 | env:
20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/release-please-core.yml:
--------------------------------------------------------------------------------
1 | name: release-please-core
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 |
9 | permissions:
10 | contents: write
11 | pull-requests: write
12 |
13 | jobs:
14 | release-please:
15 | runs-on: ubuntu-latest
16 | outputs:
17 | release_created: ${{ steps.release.outputs['core--release_created'] }}
18 | steps:
19 | - name: Checkout repository
20 | uses: actions/checkout@v3
21 | with:
22 | fetch-depth: 0 # Fetch all history for tags and releases
23 |
24 | - name: Setup Python
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: '3.11'
28 |
29 | - name: Run release-please
30 | id: release
31 | uses: google-github-actions/release-please-action@v4
32 | with:
33 | path: core
34 | token: ${{ secrets.RELEASE_PLEASE_TOKEN }}
35 |
36 |
37 | deploy:
38 | if: needs.release-please.outputs.release_created == 'true'
39 | needs: release-please
40 | runs-on: ubuntu-latest
41 | defaults:
42 | run:
43 | working-directory: core
44 | steps:
45 | - uses: actions/checkout@v4
46 | - name: Install Rye
47 | uses: eifinger/setup-rye@v2
48 | with:
49 | enable-cache: true
50 | - name: Rye Sync
51 | run: UV_INDEX_STRATEGY=unsafe-first-match rye sync --no-lock
52 | - name: Rye Build
53 | run: rye build
54 | - name: Rye Publish
55 | run: rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes --skip-existing
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: "Close stale issues and PRs"
2 | on:
3 | schedule:
4 | - cron: "0 */4 * * *"
5 |
6 | permissions:
7 | contents: write # only for delete-branch option
8 | issues: write
9 | pull-requests: write
10 |
11 | jobs:
12 | stale:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/stale@main
16 | with:
17 | exempt-assignees: true
18 | exempt-draft-pr: true
19 | days-before-stale: 90
20 | days-before-close: 5
21 | operations-per-run: 400
22 | exempt-milestones: true
23 | stale-issue-message: "Thanks for your contributions, we'll be closing this issue as it has gone stale. Feel free to reopen if you'd like to continue the discussion."
24 | stale-pr-message: "Thanks for your contributions, we'll be closing this PR as it has gone stale. Feel free to reopen if you'd like to continue the discussion."
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docker-compose.override.yml
2 | secondbrain/
3 | .env
4 | env.sh
5 | .streamlit/secrets.toml
6 | **/*.pyc
7 | toto.txt
8 | log.txt
9 |
10 | backend/venv
11 | backend/.env
12 | backend/*.deb
13 | backend/.python-version
14 |
15 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
16 |
17 | # dependencies
18 | **/node_modules
19 | **/.pnp
20 | .pnp.js
21 |
22 | Pipfile
23 |
24 | # testing
25 | **/coverage
26 |
27 | # next.js
28 | **/.next/
29 | **/out/
30 |
31 | # production
32 | **/build
33 |
34 | # misc
35 | .DS_Store
36 | *.pem
37 |
38 | # debug
39 | npm-debug.log*
40 | yarn-debug.log*
41 | yarn-error.log*
42 |
43 | # local env files
44 | .env*.local
45 |
46 | # vercel
47 | .vercel
48 |
49 | # typescript
50 | *.tsbuildinfo
51 | next-env.d.ts
52 | quivr/*
53 | streamlit-demo/.streamlit/secrets.toml
54 | .backend_env
55 | .frontend_env
56 | backend/core/pandoc-*
57 | **/.pandoc-*
58 | backend/core/application_default_credentials.json
59 |
60 | #local models
61 | backend/core/local_models/*
62 |
63 | ## scripts
64 | package-lock.json
65 | celerybeat-schedule
66 | frontend/public/robots.txt
67 | frontend/public/sitemap*
68 |
69 | pyfiles/*
70 | backend/bin/*
71 | backend/lib/*
72 | backend/pyvenv.cfg
73 | backend/share/*
74 | backend/slim.report.json
75 | volumes/db/data/
76 | volumes/storage/stub/stub/quivr/*
77 | supabase/migrations/20240103191539_private.sql
78 | supabase/20240103191539_private.sql
79 | paulgraham.py
80 | .env_test
81 | supabase/seed-airwallex.sql
82 | airwallexpayouts.py
83 | **/application.log*
84 | backend/celerybeat-schedule.db
85 |
86 | backend/application.log.*
87 | backend/score.json
88 | backend/modules/assistant/ito/utils/simple.pdf
89 | backend/modules/sync/controller/credentials.json
90 | backend/.env.test
91 |
92 | **/*.egg-info
93 |
94 | .coverage
95 | backend/core/examples/chatbot/.files/*
96 | backend/core/examples/chatbot/.python-version
97 | backend/core/examples/chatbot/.chainlit/config.toml
98 | backend/core/examples/chatbot/.chainlit/translations/en-US.json
99 |
100 | *.log
101 |
102 | # Tox
103 | .tox
104 | Pipfile
105 | *.pkl
106 | backend/docs/site/*
107 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.6.0
4 | hooks:
5 | - id: check-added-large-files
6 | args: ["--maxkb=5000"]
7 | - id: check-toml
8 | - id: check-yaml
9 | - id: end-of-file-fixer
10 | - id: trailing-whitespace
11 | - id: check-merge-conflict
12 | - id: detect-private-key
13 | - id: check-case-conflict
14 | - repo: https://github.com/pre-commit/pre-commit
15 | rev: v3.6.2
16 | hooks:
17 | - id: validate_manifest
18 | - repo: https://github.com/astral-sh/ruff-pre-commit
19 | # Ruff version.
20 | rev: v0.5.1
21 | hooks:
22 | # Run the linter.
23 | - id: ruff
24 | args: [--fix, --isolated]
25 | additional_dependencies: []
26 | # Run the formatter.
27 | - id: ruff-format
28 | args: [--isolated]
29 | additional_dependencies: []
30 | - repo: https://github.com/pre-commit/mirrors-mypy
31 | rev: v1.10.1
32 | hooks:
33 | - id: mypy
34 | name: mypy
35 | args: ["--ignore-missing-imports", "--no-incremental", "--follow-imports=skip"]
36 | additional_dependencies: ["types-aiofiles", "types-pyyaml", "pydantic", "sqlmodel"]
37 | ci:
38 | autofix_commit_msg: |
39 | [pre-commit.ci] auto fixes from pre-commit.com hooks
40 |
41 | for more information, see https://pre-commit.ci
42 | autofix_prs: true
43 | autoupdate_branch: ""
44 | autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
45 | autoupdate_schedule: weekly
46 | skip: []
47 | submodules: false
48 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file for MkDocs projects
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | # Required
5 | version: 2
6 |
7 | # Set the version of Python and other tools you might need
8 | build:
9 | os: ubuntu-22.04
10 | tools:
11 | python: "3.11"
12 | commands:
13 | - asdf plugin add uv
14 | - asdf install uv latest
15 | - asdf global uv latest
16 | - uv venv
17 | - cd docs && UV_INDEX_STRATEGY=unsafe-first-match uv pip install -r requirements.lock
18 | - cd docs/ && ls -la && NO_COLOR=1 ../.venv/bin/mkdocs build --strict --site-dir $READTHEDOCS_OUTPUT/html --config-file mkdocs.yml
19 |
20 |
21 |
22 |
23 | mkdocs:
24 |   configuration: docs/mkdocs.yml
25 |
26 |
--------------------------------------------------------------------------------
/.release-please-manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "core": "0.0.33"
3 | }
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | "recommendations": [
3 | "ms-pyright.pyright",
4 | "dbaeumer.vscode-eslint",
5 | "ms-python.vscode-pylance",
6 | "ms-pyright.pyright",
7 | "inlang.vs-code-extension",
8 | "denoland.vscode-deno"
9 | ]
10 | }
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "name": "Python: Remote Attach",
6 | "type": "python",
7 | "request": "attach",
8 | "connect": {
9 | "host": "localhost",
10 | "port": 5678
11 | },
12 | "pathMappings": [
13 | {
14 | "localRoot": "${workspaceFolder}/backend",
15 | "remoteRoot": "."
16 | }
17 | ],
18 | "justMyCode": true
19 | },
20 | {
21 | "name": "Python: Debug Test Script",
22 | "type": "python",
23 | "request": "launch",
24 | "program": "${workspaceFolder}/backend/test_process_file_and_notify.py",
25 | "console": "integratedTerminal",
26 | "justMyCode": false
27 | },
28 | {
29 | "name": "Python: Debug",
30 | "type": "debugpy",
31 | "request": "launch",
32 | "program": "${file}",
33 | "console": "integratedTerminal",
34 | "justMyCode": false,
35 | "env": {
36 | "PYTHONPATH": "${workspaceFolder}/backend:${env:PYTHONPATH}"
37 | },
38 | "envFile": "${workspaceFolder}/.env"
39 | }
40 | ]
41 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "editor.codeActionsOnSave": {
3 | "source.organizeImports": "explicit",
4 | "source.fixAll": "explicit",
5 | "source.unusedImports": "explicit",
6 | },
7 | "editor.formatOnSave": true,
8 | "editor.formatOnSaveMode": "file",
9 | "files.exclude": {
10 | "**/__pycache__": true,
11 | "**/.benchmarks/": true,
12 | "**/.cache/": true,
13 | "**/.pytest_cache/": true,
14 | "**/.next/": true,
15 | "**/build/": true,
16 | "**/.docusaurus/": true,
17 | "**/node_modules/": true
18 | },
19 | "[python]": {
20 | "editor.defaultFormatter": "charliermarsh.ruff",
21 | "editor.formatOnSave": true,
22 | "editor.codeActionsOnSave": {
23 | "source.organizeImports": "explicit",
24 | "source.fixAll": "explicit"
25 | }
26 | },
27 | "python.analysis.extraPaths": [
28 | "./backend"
29 | ],
30 | "python.defaultInterpreterPath": "python3",
31 | "python.testing.pytestArgs": [
32 | "-v",
33 | "--color=yes",
34 | "--envfile=backend/tests/.env_test",
35 | "backend/"
36 | ],
37 | "python.testing.unittestEnabled": false,
38 | "python.testing.pytestEnabled": true,
39 | "python.testing.autoTestDiscoverOnSaveEnabled": true,
40 | "python.analysis.autoImportCompletions": true,
41 | "python.analysis.typeCheckingMode": "basic",
42 | "python.analysis.diagnosticSeverityOverrides": {
43 | "reportMissingImports": "error",
44 | "reportUnusedImport": "warning",
45 | "reportGeneralTypeIssues": "warning"
46 | },
47 | "makefile.configureOnOpen": false,
48 | "djlint.showInstallError": false
49 | }
50 |
--------------------------------------------------------------------------------
/core/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ; Minimal configuration for Flake8 to work with Black.
3 | max-line-length = 100
4 | ignore = E101,E111,E112,E221,E222,E501,E711,E712,W503,W504,F401,E203
5 |
--------------------------------------------------------------------------------
/core/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # venv
10 | .venv
11 |
--------------------------------------------------------------------------------
/core/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/core/Dockerfile.test:
--------------------------------------------------------------------------------
1 | # Using a slim version for a smaller base image
2 | FROM python:3.11.6-slim-bullseye
3 |
4 | # Install build tools and document-handling dependencies, then clean up
5 | RUN apt-get clean && apt-get update && apt-get install -y \
6 | curl \
7 | gcc \
8 | autoconf \
9 | automake \
10 | build-essential \
11 | # Additional dependencies for document handling
12 | libmagic-dev \
13 | tesseract-ocr \
14 | poppler-utils \
15 | libreoffice \
16 | pandoc && \
17 | rm -rf /var/lib/apt/lists/*
18 |
19 | # Set the working directory
20 | WORKDIR /code
21 |
22 | # Install Poetry
23 | RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
24 | cd /usr/local/bin && \
25 | ln -s /opt/poetry/bin/poetry && \
26 | poetry config virtualenvs.create false
27 |
28 | # Add Poetry to PATH
29 | ENV PATH="/root/.local/bin:$PATH"
30 |
31 | # Copy the dependency manifests into the container at /code
32 | COPY ./pyproject.toml ./poetry.lock* /code/
33 |
34 | RUN python3 -m pip install nltk && python3 -c "import nltk; nltk.download('punkt')" \
35 | && python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')"
36 |
37 | # Install project dependencies
38 | RUN poetry install --with test
39 |
40 | ENV PYTHONPATH=/code
41 |
--------------------------------------------------------------------------------
/core/README.md:
--------------------------------------------------------------------------------
1 | # quivr-core package
2 |
3 | The RAG of Quivr.com
4 |
5 | ## License 📄
6 |
7 | This project is licensed under the Apache 2.0 License
8 |
9 | ## Installation
10 |
11 | ```bash
12 | pip install quivr-core
13 | ```
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
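To complement the installation step above, here is a minimal usage sketch. It assumes an `OPENAI_API_KEY` is set in the environment (the default embedder and LLM are OpenAI-based, see `quivr_core/brain/brain_defaults.py`); the file path is a placeholder and the exact `Brain` signatures live in `quivr_core/brain/brain.py`.

```python
from quivr_core import Brain

# Index local files into the default in-memory FAISS vector store.
# "./my_first_doc.pdf" is a placeholder path.
brain = Brain.from_files(
    name="my_brain",
    file_paths=["./my_first_doc.pdf"],
)

# Ask a question grounded in the indexed documents.
answer = brain.ask("What is this document about?")
print(answer.answer)
```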
/core/example_workflows/talk_to_file_rag_config_workflow.yaml:
--------------------------------------------------------------------------------
1 | {
2 | "max_files": 20,
3 | "llm_config": { "temperature": 0.3, "max_context_tokens": 20000 },
4 | "max_history": 10,
5 | "reranker_config":
6 | { "model": "rerank-v3.5", "top_n": 10, "supplier": "cohere" },
7 | "workflow_config":
8 | {
9 | "name": "Standard RAG",
10 | "nodes":
11 | [
12 | {
13 | "name": "START",
14 | "edges": ["filter_history"],
15 | "description": "Starting workflow",
16 | },
17 | {
18 | "name": "filter_history",
19 | "edges": ["retrieve"],
20 | "description": "Filtering history",
21 | },
22 | {
23 | "name": "retrieve",
24 | "edges": ["retrieve_full_documents_context"],
25 | "description": "Retrieving relevant information",
26 | },
27 | {
28 | "name": "retrieve_full_documents_context",
29 | "edges": ["generate_zendesk_rag"],
30 | "description": "Retrieving full tickets context",
31 | },
32 | {
33 | "name": "generate_zendesk_rag",
34 | "edges": ["END"],
35 | "description": "Generating answer",
36 | },
37 | ],
38 | },
39 | }
40 |
--------------------------------------------------------------------------------
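A sketch of loading the workflow above into a retrieval configuration, assuming the configuration class in `quivr_core/rag/entities/config.py` is named `RetrievalConfig` and inherits `from_yaml` from `QuivrBaseConfig`:

```python
# Assumption: RetrievalConfig is the QuivrBaseConfig subclass that models
# max_files, llm_config, reranker_config and workflow_config.
from quivr_core.rag.entities.config import RetrievalConfig

retrieval_config = RetrievalConfig.from_yaml(
    "core/example_workflows/talk_to_file_rag_config_workflow.yaml"
)
print(retrieval_config.workflow_config.name)  # "Standard RAG"
```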
/core/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quivr-core"
3 | version = "0.0.33"
4 | description = "Quivr core RAG package"
5 | authors = [{ name = "Stan Girard", email = "stan@quivr.app" }]
6 | dependencies = [
7 | "pydantic>=2.8.2",
8 | "langchain-core>=0.3,<0.4",
9 | "langchain>=0.3.9,<0.4",
10 | "langgraph>=0.2.38,<0.3",
11 | "httpx>=0.27.0",
12 | "rich>=13.7.1",
13 | "tiktoken>=0.7.0",
14 | "aiofiles>=23.1.0",
15 | "langchain-openai>=0.3.0",
16 | "langchain-cohere>=0.1.0",
17 | "langchain-community>=0.3,<0.4",
18 | "langchain-anthropic>=0.1.23",
19 | "types-pyyaml>=6.0.12.20240808",
20 | "transformers[sentencepiece]>=4.44.2",
21 | "faiss-cpu>=1.8.0.post1",
22 | "rapidfuzz>=3.10.1",
23 | "markupsafe>=2.1.5",
24 | "megaparse-sdk>=0.1.11",
25 | "langchain-mistralai>=0.2.3",
26 | "langchain-google-genai>=2.1.3",
27 | "fasttext-langdetect>=1.0.5",
28 | "langfuse>=2.57.0",
29 | "langchain-groq>=0.3.2",
30 | ]
31 | readme = "README.md"
32 | requires-python = ">= 3.11"
33 |
34 | [build-system]
35 | requires = ["hatchling"]
36 | build-backend = "hatchling.build"
37 |
38 | [tool.rye]
39 | managed = true
40 | dev-dependencies = [
41 | "mypy>=1.11.1",
42 | "pre-commit>=3.8.0",
43 | "ipykernel>=6.29.5",
44 | "ruff>=0.6.1",
45 | "flake8>=7.1.1",
46 | "flake8-black>=0.3.6",
47 | "pytest-asyncio>=0.23.8",
48 | "pytest>=8.3.2",
49 | "pytest-xdist>=3.6.1",
50 | "pytest-benchmark>=4.0.0",
51 | ]
52 |
53 | [tool.hatch.metadata]
54 | allow-direct-references = true
55 |
56 | [tool.hatch.build.targets.wheel]
57 | packages = ["quivr_core"]
58 |
59 | [tool.pytest.ini_options]
60 | addopts = "--tb=short -ra -v"
61 | filterwarnings = ["ignore::DeprecationWarning"]
62 | markers = [
63 | "slow: marks tests as slow (deselect with '-m \"not slow\"')",
64 | "base: these tests require quivr-core with extra `base` to be installed",
65 | "tika: these tests require a tika server to be running",
66 | "unstructured: these tests require `unstructured` dependency",
67 | ]
68 |
69 | [[tool.mypy.overrides]]
70 | module = "yaml"
71 | ignore_missing_imports = true
72 |
--------------------------------------------------------------------------------
/core/quivr_core/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import entry_points
2 |
3 | from .brain import Brain
4 | from .processor.registry import register_processor, registry
5 |
6 | __all__ = ["Brain", "registry", "register_processor"]
7 |
8 |
9 | def register_entries():
10 | if entry_points is not None:
11 | try:
12 | eps = entry_points()
13 | except TypeError:
14 | pass # importlib-metadata < 0.8
15 | else:
16 | if hasattr(eps, "select"): # Python 3.10+ / importlib_metadata >= 3.9.0
17 | processors = eps.select(group="quivr_core.processor")
18 | else:
19 | processors = eps.get("quivr_core.processor", [])
20 | registered_names = set()
21 | for spec in processors:
22 | err_msg = f"Unable to load processor from {spec}"
23 | name = spec.name
24 | if name in registered_names:
25 | continue
26 | registered_names.add(name)
27 | register_processor(
28 | name,
29 | spec.value.replace(":", "."),
30 | errtxt=err_msg,
31 | append=True,
32 | )
33 |
34 |
35 | register_entries()
36 |
--------------------------------------------------------------------------------
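`register_entries()` above wires third-party processors discovered through the `quivr_core.processor` entry-point group into the registry. The same registration can be done manually with the exported `register_processor`; a sketch with a hypothetical extension and import path:

```python
from quivr_core import register_processor

# Hypothetical example: map a file extension to a processor class using the
# same lazy "module.ClassName" string form that register_entries() passes on.
register_processor(
    ".txt",
    "my_pkg.processors.MyTxtProcessor",
    errtxt="Unable to load processor my_pkg.processors.MyTxtProcessor",
    append=True,
)
```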
/core/quivr_core/base_config.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import yaml
4 | from pydantic import BaseModel, ConfigDict
5 | from typing import Self
6 |
7 |
8 | class QuivrBaseConfig(BaseModel):
9 | """
10 | Base configuration class for Quivr.
11 |
12 | This class extends Pydantic's BaseModel and provides a foundation for
13 | configuration management in quivr-core.
14 |
15 | Attributes:
16 | model_config (ConfigDict): Configuration for the Pydantic model.
17 | It's set to forbid extra attributes, ensuring strict adherence
18 | to the defined schema.
19 |
20 | Class Methods:
21 | from_yaml: Create an instance of the class from a YAML file.
22 | """
23 |
24 | model_config = ConfigDict(extra="forbid")
25 |
26 | @classmethod
27 | def from_yaml(cls, file_path: str | Path) -> Self:
28 | """
29 | Create an instance of the class from a YAML file.
30 |
31 | Args:
32 | file_path (str | Path): The path to the YAML file.
33 |
34 | Returns:
35 | QuivrBaseConfig: An instance of the class initialized with the data from the YAML file.
36 | """
37 | # Load the YAML file
38 | with open(file_path, "r") as stream:
39 | config_data = yaml.safe_load(stream)
40 |
41 | # Instantiate the class using the YAML data
42 | return cls(**config_data)
43 |
--------------------------------------------------------------------------------
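Because `model_config` forbids extra attributes, every `QuivrBaseConfig` subclass gets strict YAML loading through `from_yaml`. A small sketch with a hypothetical subclass and config file:

```python
from quivr_core.base_config import QuivrBaseConfig


class MyAppConfig(QuivrBaseConfig):
    """Hypothetical config used only to illustrate from_yaml."""

    model_name: str
    temperature: float = 0.3


# my_app.yaml (hypothetical):
#   model_name: gpt-4o
#   temperature: 0.1
config = MyAppConfig.from_yaml("my_app.yaml")
print(config.model_name, config.temperature)

# A misspelled or unknown key in the YAML raises a pydantic ValidationError,
# since extra attributes are forbidden.
```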
/core/quivr_core/brain/__init__.py:
--------------------------------------------------------------------------------
1 | from .brain import Brain
2 |
3 | __all__ = ["Brain"]
4 |
--------------------------------------------------------------------------------
/core/quivr_core/brain/brain_defaults.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from langchain_core.documents import Document
4 | from langchain_core.embeddings import Embeddings
5 | from langchain_core.vectorstores import VectorStore
6 |
7 | from quivr_core.rag.entities.config import DefaultModelSuppliers, LLMEndpointConfig
8 | from quivr_core.llm import LLMEndpoint
9 |
10 | logger = logging.getLogger("quivr_core")
11 |
12 |
13 | async def build_default_vectordb(
14 | docs: list[Document], embedder: Embeddings
15 | ) -> VectorStore:
16 | try:
17 | from langchain_community.vectorstores import FAISS
18 |
19 | logger.debug("Using Faiss-CPU as vector store.")
20 | # TODO(@aminediro) : embedding call is usually not concurrent for all documents but waits
21 | if len(docs) > 0:
22 | vector_db = await FAISS.afrom_documents(documents=docs, embedding=embedder)
23 | return vector_db
24 | else:
25 | raise ValueError("can't initialize brain without documents")
26 |
27 | except ImportError as e:
28 | raise ImportError(
29 | "Please provide a valid vector store or install quivr-core['base'] package for using the default one."
30 | ) from e
31 |
32 |
33 | def default_embedder() -> Embeddings:
34 | try:
35 | from langchain_openai import OpenAIEmbeddings
36 |
37 |         logger.debug("Loaded OpenAIEmbeddings as default embedder for brain")
38 | embedder = OpenAIEmbeddings()
39 | return embedder
40 | except ImportError as e:
41 | raise ImportError(
42 |             "Please provide a valid Embedder or install quivr-core['base'] package for using the default one."
43 | ) from e
44 |
45 |
46 | def default_llm() -> LLMEndpoint:
47 | try:
48 | logger.debug("Loaded ChatOpenAI as default LLM for brain")
49 | llm = LLMEndpoint.from_config(
50 | LLMEndpointConfig(supplier=DefaultModelSuppliers.OPENAI, model="gpt-4o")
51 | )
52 | return llm
53 |
54 | except ImportError as e:
55 | raise ImportError(
56 | "Please provide a valid BaseLLM or install quivr-core['base'] package"
57 | ) from e
58 |
--------------------------------------------------------------------------------
/core/quivr_core/brain/info.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from uuid import UUID
3 |
4 | from rich.tree import Tree
5 |
6 |
7 | @dataclass
8 | class ChatHistoryInfo:
9 | nb_chats: int
10 | current_default_chat: UUID
11 | current_chat_history_length: int
12 |
13 | def add_to_tree(self, chats_tree: Tree):
14 | chats_tree.add(f"Number of Chats: [bold]{self.nb_chats}[/bold]")
15 | chats_tree.add(
16 | f"Current Default Chat: [bold magenta]{self.current_default_chat}[/bold magenta]"
17 | )
18 | chats_tree.add(
19 | f"Current Chat History Length: [bold]{self.current_chat_history_length}[/bold]"
20 | )
21 |
22 |
23 | @dataclass
24 | class LLMInfo:
25 | model: str
26 | llm_base_url: str
27 | temperature: float
28 | max_tokens: int
29 | supports_function_calling: int
30 |
31 | def add_to_tree(self, llm_tree: Tree):
32 | llm_tree.add(f"Model: [italic]{self.model}[/italic]")
33 | llm_tree.add(f"Base URL: [underline]{self.llm_base_url}[/underline]")
34 | llm_tree.add(f"Temperature: [bold]{self.temperature}[/bold]")
35 | llm_tree.add(f"Max Tokens: [bold]{self.max_tokens}[/bold]")
36 | func_call_color = "green" if self.supports_function_calling else "red"
37 | llm_tree.add(
38 | f"Supports Function Calling: [bold {func_call_color}]{self.supports_function_calling}[/bold {func_call_color}]"
39 | )
40 |
41 |
42 | @dataclass
43 | class StorageInfo:
44 | storage_type: str
45 | n_files: int
46 |
47 | def add_to_tree(self, files_tree: Tree):
48 | files_tree.add(f"Storage Type: [italic]{self.storage_type}[/italic]")
49 | files_tree.add(f"Number of Files: [bold]{self.n_files}[/bold]")
50 |
51 |
52 | @dataclass
53 | class BrainInfo:
54 | brain_id: UUID
55 | brain_name: str
56 | chats_info: ChatHistoryInfo
57 | llm_info: LLMInfo
58 | files_info: StorageInfo | None = None
59 |
60 | def to_tree(self):
61 | tree = Tree("📊 Brain Information")
62 | tree.add(f"🆔 ID: [bold cyan]{self.brain_id}[/bold cyan]")
63 | tree.add(f"🧠 Brain Name: [bold green]{self.brain_name}[/bold green]")
64 |
65 | if self.files_info:
66 | files_tree = tree.add("📁 Files")
67 | self.files_info.add_to_tree(files_tree)
68 |
69 | chats_tree = tree.add("💬 Chats")
70 | self.chats_info.add_to_tree(chats_tree)
71 |
72 | llm_tree = tree.add("🤖 LLM")
73 | self.llm_info.add_to_tree(llm_tree)
74 | return tree
75 |
--------------------------------------------------------------------------------
/core/quivr_core/brain/serialization.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Any, Dict, Literal, Union
3 | from uuid import UUID
4 |
5 | from pydantic import BaseModel, Field, SecretStr
6 |
7 | from quivr_core.rag.entities.config import LLMEndpointConfig
8 | from quivr_core.rag.entities.models import ChatMessage
9 | from quivr_core.files.file import QuivrFileSerialized
10 |
11 |
12 | class EmbedderConfig(BaseModel):
13 | embedder_type: Literal["openai_embedding"] = "openai_embedding"
14 | # TODO: type this correctly
15 | config: Dict[str, Any]
16 |
17 |
18 | class PGVectorConfig(BaseModel):
19 | vectordb_type: Literal["pgvector"] = "pgvector"
20 | pg_url: str
21 | pg_user: str
22 | pg_psswd: SecretStr
23 | table_name: str
24 | vector_dim: int
25 |
26 |
27 | class FAISSConfig(BaseModel):
28 | vectordb_type: Literal["faiss"] = "faiss"
29 | vectordb_folder_path: str
30 |
31 |
32 | class LocalStorageConfig(BaseModel):
33 | storage_type: Literal["local_storage"] = "local_storage"
34 | storage_path: Path
35 | files: dict[UUID, QuivrFileSerialized]
36 |
37 |
38 | class TransparentStorageConfig(BaseModel):
39 | storage_type: Literal["transparent_storage"] = "transparent_storage"
40 | files: dict[UUID, QuivrFileSerialized]
41 |
42 |
43 | class BrainSerialized(BaseModel):
44 | id: UUID
45 | name: str
46 | chat_history: list[ChatMessage]
47 | vectordb_config: Union[FAISSConfig, PGVectorConfig] = Field(
48 | ..., discriminator="vectordb_type"
49 | )
50 | storage_config: Union[TransparentStorageConfig, LocalStorageConfig] = Field(
51 | ..., discriminator="storage_type"
52 | )
53 |
54 | llm_config: LLMEndpointConfig
55 | embedding_config: EmbedderConfig
56 |
--------------------------------------------------------------------------------
/core/quivr_core/config.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | import yaml
4 | from pydantic import BaseModel
5 |
6 |
7 | class ParserType(str, Enum):
8 | """Parser type enumeration."""
9 |
10 | UNSTRUCTURED = "unstructured"
11 | LLAMA_PARSER = "llama_parser"
12 | MEGAPARSE_VISION = "megaparse_vision"
13 |
14 |
15 | class StrategyEnum(str, Enum):
16 | """Method to use for the conversion"""
17 |
18 | FAST = "fast"
19 | AUTO = "auto"
20 | HI_RES = "hi_res"
21 |
22 |
23 | class MegaparseBaseConfig(BaseModel):
24 | @classmethod
25 | def from_yaml(cls, file_path: str):
26 | # Load the YAML file
27 | with open(file_path, "r") as stream:
28 | config_data = yaml.safe_load(stream)
29 |
30 | # Instantiate the class using the YAML data
31 | return cls(**config_data)
32 |
33 |
34 | class MegaparseConfig(MegaparseBaseConfig):
35 | method: ParserType = ParserType.UNSTRUCTURED
36 | strategy: StrategyEnum = StrategyEnum.FAST
37 | check_table: bool = False
38 | parsing_instruction: str | None = None
39 | model_name: str = "gpt-4o"
40 |
--------------------------------------------------------------------------------
/core/quivr_core/files/__init__.py:
--------------------------------------------------------------------------------
1 | from .file import QuivrFile
2 |
3 | __all__ = ["QuivrFile"]
4 |
--------------------------------------------------------------------------------
/core/quivr_core/language/utils.py:
--------------------------------------------------------------------------------
1 | from ftlangdetect import detect
2 | from quivr_core.language.models import Language
3 |
4 |
5 | def detect_language(text: str, low_memory: bool = True) -> Language:
6 | detected_lang = detect(text=text, low_memory=low_memory)
7 | try:
8 | detected_language = Language(detected_lang["lang"])
9 | except ValueError:
10 | return Language.UNKNOWN
11 |
12 | return detected_language
13 |
--------------------------------------------------------------------------------
/core/quivr_core/llm/__init__.py:
--------------------------------------------------------------------------------
1 | from .llm_endpoint import LLMEndpoint
2 |
3 | __all__ = ["LLMEndpoint"]
4 |
--------------------------------------------------------------------------------
/core/quivr_core/llm_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/llm_tools/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/llm_tools/entity.py:
--------------------------------------------------------------------------------
1 | from quivr_core.base_config import QuivrBaseConfig
2 | from typing import Callable
3 | from langchain_core.tools import BaseTool
4 | from typing import Dict, Any
5 |
6 |
7 | class ToolsCategory(QuivrBaseConfig):
8 | name: str
9 | description: str
10 | tools: list
11 | default_tool: str | None = None
12 | create_tool: Callable
13 |
14 | def __init__(self, **data):
15 | super().__init__(**data)
16 | self.name = self.name.lower()
17 |
18 |
19 | class ToolWrapper:
20 | def __init__(self, tool: BaseTool, format_input: Callable, format_output: Callable):
21 | self.tool = tool
22 | self.format_input = format_input
23 | self.format_output = format_output
24 |
25 |
26 | class ToolRegistry:
27 | def __init__(self):
28 | self._registry = {}
29 |
30 | def register_tool(self, tool_name: str, create_func: Callable):
31 | self._registry[tool_name] = create_func
32 |
33 | def create_tool(self, tool_name: str, config: Dict[str, Any]) -> ToolWrapper:
34 | if tool_name not in self._registry:
35 | raise ValueError(f"Tool {tool_name} is not supported.")
36 | return self._registry[tool_name](config)
37 |
--------------------------------------------------------------------------------
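The `ToolRegistry`/`ToolWrapper` pair above is the extension point used by the tool modules that follow. A sketch of registering a hypothetical tool built with LangChain's `@tool` decorator:

```python
from typing import Any, Dict

from langchain_core.tools import tool

from quivr_core.llm_tools.entity import ToolRegistry, ToolWrapper


@tool
def echo(text: str) -> str:
    """Return the input text unchanged (hypothetical example tool)."""
    return text


def create_echo_tool(config: Dict[str, Any]) -> ToolWrapper:
    # format_input adapts a plain task string to the tool's input schema;
    # format_output post-processes the raw tool response.
    return ToolWrapper(echo, lambda task: {"text": task}, lambda response: response)


registry = ToolRegistry()
registry.register_tool("echo", create_echo_tool)

wrapper = registry.create_tool("echo", {})
print(wrapper.tool.invoke(wrapper.format_input("hello")))  # "hello"
```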
/core/quivr_core/llm_tools/llm_tools.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any, Type, Union
2 |
3 | from quivr_core.llm_tools.entity import ToolWrapper
4 |
5 | from quivr_core.llm_tools.web_search_tools import (
6 | WebSearchTools,
7 | )
8 |
9 | from quivr_core.llm_tools.other_tools import (
10 | OtherTools,
11 | )
12 |
13 | TOOLS_CATEGORIES = {
14 | WebSearchTools.name: WebSearchTools,
15 | OtherTools.name: OtherTools,
16 | }
17 |
18 | # Register all ToolsList enums
19 | TOOLS_LISTS = {
20 | **{tool.value: tool for tool in WebSearchTools.tools},
21 | **{tool.value: tool for tool in OtherTools.tools},
22 | }
23 |
24 |
25 | class LLMToolFactory:
26 | @staticmethod
27 | def create_tool(tool_name: str, config: Dict[str, Any]) -> Union[ToolWrapper, Type]:
28 | for category, tools_class in TOOLS_CATEGORIES.items():
29 | if tool_name in tools_class.tools:
30 | return tools_class.create_tool(tool_name, config)
31 | elif tool_name.lower() == category and tools_class.default_tool:
32 | return tools_class.create_tool(tools_class.default_tool, config)
33 | raise ValueError(f"Tool {tool_name} is not supported.")
34 |
--------------------------------------------------------------------------------
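`LLMToolFactory.create_tool` resolves either an exact tool name or a category name, in which case the category's default tool is built. A sketch using the web search category defined below (the API key is a placeholder and can also come from `TAVILY_API_KEY`):

```python
from quivr_core.llm_tools.llm_tools import LLMToolFactory

# "Web Search" matches the category name case-insensitively, so its default
# tool (Tavily) is created; the key below is a placeholder.
wrapper = LLMToolFactory.create_tool("Web Search", {"api_key": "tvly-..."})

results = wrapper.tool.invoke(wrapper.format_input("latest quivr-core release"))
docs = wrapper.format_output(results)
print([doc.metadata["file_name"] for doc in docs])
```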
/core/quivr_core/llm_tools/other_tools.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import Dict, Any, Type, Union
3 | from langchain_core.tools import BaseTool
4 | from quivr_core.llm_tools.entity import ToolsCategory
5 | from quivr_core.rag.entities.models import cited_answer
6 |
7 |
8 | class OtherToolsList(str, Enum):
9 | CITED_ANSWER = "cited_answer"
10 |
11 |
12 | def create_other_tool(tool_name: str, config: Dict[str, Any]) -> Union[BaseTool, Type]:
13 | if tool_name == OtherToolsList.CITED_ANSWER:
14 | return cited_answer
15 | else:
16 | raise ValueError(f"Tool {tool_name} is not supported.")
17 |
18 |
19 | OtherTools = ToolsCategory(
20 | name="Other",
21 | description="Other tools",
22 | tools=[OtherToolsList.CITED_ANSWER],
23 | create_tool=create_other_tool,
24 | )
25 |
--------------------------------------------------------------------------------
/core/quivr_core/llm_tools/web_search_tools.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import Dict, List, Any
3 | from langchain_community.tools import TavilySearchResults
4 | from langchain_community.utilities.tavily_search import TavilySearchAPIWrapper
5 | from quivr_core.llm_tools.entity import ToolsCategory
6 | import os
7 | from pydantic import SecretStr # Ensure correct import
8 | from quivr_core.llm_tools.entity import ToolWrapper, ToolRegistry
9 | from langchain_core.documents import Document
10 |
11 |
12 | class WebSearchToolsList(str, Enum):
13 | TAVILY = "tavily"
14 |
15 |
16 | def create_tavily_tool(config: Dict[str, Any]) -> ToolWrapper:
17 | api_key = (
18 | config.pop("api_key") if "api_key" in config else os.getenv("TAVILY_API_KEY")
19 | )
20 | if not api_key:
21 | raise ValueError(
22 | "Missing required config key 'api_key' or environment variable 'TAVILY_API_KEY'"
23 | )
24 |
25 | tavily_api_wrapper = TavilySearchAPIWrapper(
26 | tavily_api_key=SecretStr(api_key),
27 | )
28 | tool = TavilySearchResults(
29 | api_wrapper=tavily_api_wrapper,
30 | max_results=config.pop("max_results", 5),
31 | search_depth=config.pop("search_depth", "advanced"),
32 | include_answer=config.pop("include_answer", True),
33 | **config,
34 | )
35 |
36 | tool.name = WebSearchToolsList.TAVILY.value
37 |
38 | def format_input(task: str) -> Dict[str, Any]:
39 | return {"query": task}
40 |
41 | def format_output(response: Any) -> List[Document]:
42 | metadata = {"integration": "", "integration_link": ""}
43 | return [
44 | Document(
45 | page_content=d["content"],
46 | metadata={
47 | **metadata,
48 | "file_name": d["url"] if "url" in d else "",
49 | "original_file_name": d["url"] if "url" in d else "",
50 | },
51 | )
52 | for d in response
53 | ]
54 |
55 | return ToolWrapper(tool, format_input, format_output)
56 |
57 |
58 | # Initialize the registry and register tools
59 | web_search_tool_registry = ToolRegistry()
60 | web_search_tool_registry.register_tool(WebSearchToolsList.TAVILY, create_tavily_tool)
61 |
62 |
63 | def create_web_search_tool(tool_name: str, config: Dict[str, Any]) -> ToolWrapper:
64 | return web_search_tool_registry.create_tool(tool_name, config)
65 |
66 |
67 | WebSearchTools = ToolsCategory(
68 | name="Web Search",
69 | description="Tools for web searching",
70 | tools=[WebSearchToolsList.TAVILY],
71 | default_tool=WebSearchToolsList.TAVILY,
72 | create_tool=create_web_search_tool,
73 | )
74 |
--------------------------------------------------------------------------------
/core/quivr_core/processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/processor/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/processor/implementations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/processor/implementations/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/processor/implementations/default.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Any, List, Type, TypeVar
3 |
4 | import tiktoken
5 | from langchain_community.document_loaders import (
6 | BibtexLoader,
7 | CSVLoader,
8 | Docx2txtLoader,
9 | NotebookLoader,
10 | PythonLoader,
11 | UnstructuredEPubLoader,
12 | UnstructuredExcelLoader,
13 | UnstructuredHTMLLoader,
14 | UnstructuredMarkdownLoader,
15 | UnstructuredODTLoader,
16 | UnstructuredPDFLoader,
17 | UnstructuredPowerPointLoader,
18 | )
19 | from langchain_community.document_loaders.base import BaseLoader
20 | from langchain_community.document_loaders.text import TextLoader
21 | from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
22 |
23 | from quivr_core.files.file import FileExtension, QuivrFile
24 | from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
25 | from quivr_core.processor.splitter import SplitterConfig
26 |
27 | logger = logging.getLogger("quivr_core")
28 |
29 | P = TypeVar("P", bound=BaseLoader)
30 |
31 |
32 | class ProcessorInit(ProcessorBase):
33 | def __init__(self, *args, **loader_kwargs) -> None:
34 | pass
35 |
36 |
37 | # FIXME(@aminediro):
38 | # dynamically creates Processor classes. Maybe redo this for finer control over instantiation
39 | # processor classes are opaque as we don't know what params they take -> not easy to have LSP completion
40 | def _build_processor(
41 | cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
42 | ) -> Type[ProcessorInit]:
43 | enc = tiktoken.get_encoding("cl100k_base")
44 |
45 | class _Processor(ProcessorBase):
46 | supported_extensions = cls_extensions
47 |
48 | def __init__(
49 | self,
50 | splitter: TextSplitter | None = None,
51 | splitter_config: SplitterConfig = SplitterConfig(),
52 | **loader_kwargs: dict[str, Any],
53 | ) -> None:
54 | self.loader_cls = load_cls
55 | self.loader_kwargs = loader_kwargs
56 |
57 | self.splitter_config = splitter_config
58 |
59 | if splitter:
60 | self.text_splitter = splitter
61 | else:
62 | self.text_splitter = (
63 | RecursiveCharacterTextSplitter.from_tiktoken_encoder(
64 | chunk_size=splitter_config.chunk_size,
65 | chunk_overlap=splitter_config.chunk_overlap,
66 | )
67 | )
68 |
69 | @property
70 | def processor_metadata(self) -> dict[str, Any]:
71 | return {
72 | "processor_cls": self.loader_cls.__name__,
73 | "splitter": self.splitter_config.model_dump(),
74 | }
75 |
76 | async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[None]:
77 | if hasattr(self.loader_cls, "__init__"):
78 | # NOTE: mypy can't correctly type this as BaseLoader doesn't have a constructor method
79 | loader = self.loader_cls(file_path=str(file.path), **self.loader_kwargs) # type: ignore
80 | else:
81 | loader = self.loader_cls()
82 |
83 | documents = await loader.aload()
84 | docs = self.text_splitter.split_documents(documents)
85 |
86 | for doc in docs:
87 | # TODO: This metadata info should be typed
88 | doc.metadata = {"chunk_size": len(enc.encode(doc.page_content))}
89 |
90 | return ProcessedDocument(
91 | chunks=docs, processor_cls=cls_name, processor_response=None
92 | )
93 |
94 | return type(cls_name, (ProcessorInit,), dict(_Processor.__dict__))
95 |
96 |
97 | CSVProcessor = _build_processor("CSVProcessor", CSVLoader, [FileExtension.csv])
98 | TikTokenTxtProcessor = _build_processor(
99 | "TikTokenTxtProcessor", TextLoader, [FileExtension.txt]
100 | )
101 | DOCXProcessor = _build_processor(
102 | "DOCXProcessor", Docx2txtLoader, [FileExtension.docx, FileExtension.doc]
103 | )
104 | XLSXProcessor = _build_processor(
105 | "XLSXProcessor", UnstructuredExcelLoader, [FileExtension.xlsx, FileExtension.xls]
106 | )
107 | PPTProcessor = _build_processor(
108 | "PPTProcessor", UnstructuredPowerPointLoader, [FileExtension.pptx]
109 | )
110 | MarkdownProcessor = _build_processor(
111 | "MarkdownProcessor",
112 | UnstructuredMarkdownLoader,
113 | [FileExtension.md, FileExtension.mdx, FileExtension.markdown],
114 | )
115 | EpubProcessor = _build_processor(
116 | "EpubProcessor", UnstructuredEPubLoader, [FileExtension.epub]
117 | )
118 | BibTexProcessor = _build_processor("BibTexProcessor", BibtexLoader, [FileExtension.bib])
119 | ODTProcessor = _build_processor(
120 | "ODTProcessor", UnstructuredODTLoader, [FileExtension.odt]
121 | )
122 | HTMLProcessor = _build_processor(
123 | "HTMLProcessor", UnstructuredHTMLLoader, [FileExtension.html]
124 | )
125 | PythonProcessor = _build_processor("PythonProcessor", PythonLoader, [FileExtension.py])
126 | NotebookProcessor = _build_processor(
127 | "NotebookProcessor", NotebookLoader, [FileExtension.ipynb]
128 | )
129 | UnstructuredPDFProcessor = _build_processor(
130 | "UnstructuredPDFProcessor", UnstructuredPDFLoader, [FileExtension.pdf]
131 | )
132 |
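The processors generated above behave like any other `ProcessorBase` subclass. A minimal usage sketch, assuming a local `notes.txt` exists (the filename, IDs and sha1 are placeholders):

```python
import asyncio
from pathlib import Path
from uuid import uuid4

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.processor.implementations.default import TikTokenTxtProcessor
from quivr_core.processor.splitter import SplitterConfig


async def main() -> None:
    # Placeholder file: any local UTF-8 text file works here.
    qfile = QuivrFile(
        id=uuid4(),
        brain_id=uuid4(),
        original_filename="notes.txt",
        path=Path("notes.txt"),
        file_extension=FileExtension.txt,
        file_sha1="placeholder-sha1",
    )
    processor = TikTokenTxtProcessor(
        splitter_config=SplitterConfig(chunk_size=200, chunk_overlap=20)
    )
    processed = await processor.process_file(qfile)
    # Each chunk carries chunk_index, chunk_size and the processor metadata.
    print(len(processed.chunks), processed.chunks[0].metadata)


asyncio.run(main())
```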
--------------------------------------------------------------------------------
/core/quivr_core/processor/implementations/megaparse_processor.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import tiktoken
4 | from langchain_core.documents import Document
5 | from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
6 | from megaparse_sdk.client import MegaParseNATSClient
7 | from megaparse_sdk.config import ClientNATSConfig
8 | from megaparse_sdk.schema.document import Document as MPDocument
9 |
10 | from quivr_core.config import MegaparseConfig
11 | from quivr_core.files.file import QuivrFile
12 | from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
13 | from quivr_core.processor.registry import FileExtension
14 | from quivr_core.processor.splitter import SplitterConfig
15 |
16 | logger = logging.getLogger("quivr_core")
17 |
18 |
19 | class MegaparseProcessor(ProcessorBase[MPDocument]):
20 | """
21 |     Megaparse processor for PDF and other supported document files.
22 | 
23 |     It can be used to parse supported files and split them into chunks.
24 |
25 | It comes from the megaparse library.
26 |
27 | ## Installation
28 | ```bash
29 | pip install megaparse
30 | ```
31 |
32 | """
33 |
34 | supported_extensions = [
35 | FileExtension.txt,
36 | FileExtension.pdf,
37 | FileExtension.docx,
38 | FileExtension.doc,
39 | FileExtension.pptx,
40 | FileExtension.xls,
41 | FileExtension.xlsx,
42 | FileExtension.csv,
43 | FileExtension.epub,
44 | FileExtension.bib,
45 | FileExtension.odt,
46 | FileExtension.html,
47 | FileExtension.markdown,
48 | FileExtension.md,
49 | FileExtension.mdx,
50 | ]
51 |
52 | def __init__(
53 | self,
54 | splitter: TextSplitter | None = None,
55 | splitter_config: SplitterConfig = SplitterConfig(),
56 | megaparse_config: MegaparseConfig = MegaparseConfig(),
57 | ) -> None:
58 | self.enc = tiktoken.get_encoding("cl100k_base")
59 | self.splitter_config = splitter_config
60 | self.megaparse_config = megaparse_config
61 |
62 | if splitter:
63 | self.text_splitter = splitter
64 | else:
65 | self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
66 | chunk_size=splitter_config.chunk_size,
67 | chunk_overlap=splitter_config.chunk_overlap,
68 | )
69 |
70 | @property
71 | def processor_metadata(self):
72 | return {
73 | "chunk_overlap": self.splitter_config.chunk_overlap,
74 | }
75 |
76 | async def process_file_inner(
77 | self, file: QuivrFile
78 | ) -> ProcessedDocument[MPDocument | str]:
79 | logger.info(f"Uploading file {file.path} to MegaParse")
80 | async with MegaParseNATSClient(ClientNATSConfig()) as client:
81 | response = await client.parse_file(file=file.path)
82 |
83 | document = Document(
84 | page_content=str(response),
85 | )
86 |
87 | chunks = self.text_splitter.split_documents([document])
88 | for chunk in chunks:
89 | chunk.metadata = {"chunk_size": len(self.enc.encode(chunk.page_content))}
90 | return ProcessedDocument(
91 | chunks=chunks,
92 | processor_cls="MegaparseProcessor",
93 | processor_response=response,
94 | )
95 |
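Since `MegaparseProcessor` delegates parsing to a MegaParse service over NATS, it only works when the `megaparse_sdk` is installed and a MegaParse endpoint is reachable with the default `ClientNATSConfig`. A rough sketch under those assumptions (the PDF path, IDs and sha1 are placeholders):

```python
import asyncio
from pathlib import Path
from uuid import uuid4

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.processor.implementations.megaparse_processor import MegaparseProcessor
from quivr_core.processor.splitter import SplitterConfig


async def main() -> None:
    qfile = QuivrFile(
        id=uuid4(),
        brain_id=uuid4(),
        original_filename="report.pdf",  # placeholder file
        path=Path("report.pdf"),
        file_extension=FileExtension.pdf,
        file_sha1="placeholder-sha1",
    )
    processor = MegaparseProcessor(
        splitter_config=SplitterConfig(chunk_size=400, chunk_overlap=100)
    )
    # Requires a running MegaParse NATS service reachable with ClientNATSConfig defaults.
    processed = await processor.process_file(qfile)
    print(processed.processor_cls, len(processed.chunks))


asyncio.run(main())
```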
--------------------------------------------------------------------------------
/core/quivr_core/processor/implementations/simple_txt_processor.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import aiofiles
4 | from langchain_core.documents import Document
5 |
6 | from quivr_core.files.file import QuivrFile
7 | from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
8 | from quivr_core.processor.registry import FileExtension
9 | from quivr_core.processor.splitter import SplitterConfig
10 |
11 |
12 | def recursive_character_splitter(
13 | doc: Document, chunk_size: int, chunk_overlap: int
14 | ) -> list[Document]:
15 | assert chunk_overlap < chunk_size, "chunk_overlap is greater than chunk_size"
16 |
17 | if len(doc.page_content) <= chunk_size:
18 | return [doc]
19 |
20 | chunk = Document(page_content=doc.page_content[:chunk_size], metadata=doc.metadata)
21 | remaining = Document(
22 | page_content=doc.page_content[chunk_size - chunk_overlap :],
23 | metadata=doc.metadata,
24 | )
25 |
26 | return [chunk] + recursive_character_splitter(remaining, chunk_size, chunk_overlap)
27 |
28 |
29 | class SimpleTxtProcessor(ProcessorBase):
30 | """
31 | SimpleTxtProcessor is a class that implements the ProcessorBase interface.
32 |     It processes plain-text files with a simple character-based splitter.
33 | """
34 |
35 | supported_extensions = [FileExtension.txt]
36 |
37 | def __init__(
38 | self, splitter_config: SplitterConfig = SplitterConfig(), **kwargs
39 | ) -> None:
40 | super().__init__(**kwargs)
41 | self.splitter_config = splitter_config
42 |
43 | @property
44 | def processor_metadata(self) -> dict[str, Any]:
45 | return {
46 | "processor_cls": "SimpleTxtProcessor",
47 | "splitter": self.splitter_config.model_dump(),
48 | }
49 |
50 | async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[str]:
51 | async with aiofiles.open(file.path, mode="r") as f:
52 | content = await f.read()
53 |
54 | doc = Document(page_content=content)
55 |
56 | docs = recursive_character_splitter(
57 | doc, self.splitter_config.chunk_size, self.splitter_config.chunk_overlap
58 | )
59 |
60 | return ProcessedDocument(
61 | chunks=docs, processor_cls="SimpleTxtProcessor", processor_response=content
62 | )
63 |
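Because `recursive_character_splitter` slices the raw string directly, the overlap is easy to see on a tiny input. A small sketch:

```python
from langchain_core.documents import Document

from quivr_core.processor.implementations.simple_txt_processor import (
    recursive_character_splitter,
)

doc = Document(page_content="abcdefgh")
# Each new chunk re-reads the last character of the previous one (overlap of 1).
chunks = recursive_character_splitter(doc, chunk_size=4, chunk_overlap=1)
print([c.page_content for c in chunks])  # ['abcd', 'defg', 'gh']
```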
--------------------------------------------------------------------------------
/core/quivr_core/processor/implementations/tika_processor.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import AsyncIterable
4 |
5 | import httpx
6 | import tiktoken
7 | from langchain_core.documents import Document
8 | from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
9 |
10 | from quivr_core.files.file import QuivrFile
11 | from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase
12 | from quivr_core.processor.registry import FileExtension
13 | from quivr_core.processor.splitter import SplitterConfig
14 |
15 | logger = logging.getLogger("quivr_core")
16 |
17 |
18 | class TikaProcessor(ProcessorBase):
19 | """
20 | TikaProcessor is a class that implements the ProcessorBase interface.
21 |     It processes files by sending them to an Apache Tika server for text extraction.
22 | 
23 |     To run a Tika server with Docker:
24 | ```bash
25 | docker run -d -p 9998:9998 apache/tika
26 | ```
27 | """
28 |
29 | supported_extensions = [FileExtension.pdf]
30 |
31 | def __init__(
32 | self,
33 | tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
34 | splitter: TextSplitter | None = None,
35 | splitter_config: SplitterConfig = SplitterConfig(),
36 | timeout: float = 5.0,
37 | max_retries: int = 3,
38 | ) -> None:
39 | self.tika_url = tika_url
40 | self.max_retries = max_retries
41 | self._client = httpx.AsyncClient(timeout=timeout)
42 |
43 | self.enc = tiktoken.get_encoding("cl100k_base")
44 | self.splitter_config = splitter_config
45 |
46 | if splitter:
47 | self.text_splitter = splitter
48 | else:
49 | self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
50 | chunk_size=splitter_config.chunk_size,
51 | chunk_overlap=splitter_config.chunk_overlap,
52 | )
53 |
54 | async def _send_parse_tika(self, f: AsyncIterable[bytes]) -> str:
55 | retry = 0
56 | headers = {"Accept": "text/plain"}
57 | while retry < self.max_retries:
58 | try:
59 | resp = await self._client.put(self.tika_url, headers=headers, content=f)
60 | resp.raise_for_status()
61 | return resp.content.decode("utf-8")
62 | except Exception as e:
63 | retry += 1
64 |                 logger.debug(f"Tika request error: {e}. Retrying ({retry}/{self.max_retries})...")
65 | raise RuntimeError("can't send parse request to tika server")
66 |
67 | @property
68 | def processor_metadata(self):
69 | return {
70 | "chunk_overlap": self.splitter_config.chunk_overlap,
71 | }
72 |
73 | async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[None]:
74 | async with file.open() as f:
75 | txt = await self._send_parse_tika(f)
76 | document = Document(page_content=txt)
77 | docs = self.text_splitter.split_documents([document])
78 | for doc in docs:
79 | doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
80 |
81 | return ProcessedDocument(
82 | chunks=docs, processor_cls="TikaProcessor", processor_response=None
83 | )
84 |
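With a Tika server running as shown in the docstring, the processor is used like any other `ProcessorBase`. A minimal sketch, assuming Tika answers at the default `http://localhost:9998/tika` (or at `TIKA_SERVER_URL`) and that a local `sample.pdf` exists; IDs and sha1 are placeholders:

```python
import asyncio
from pathlib import Path
from uuid import uuid4

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.processor.implementations.tika_processor import TikaProcessor


async def main() -> None:
    qfile = QuivrFile(
        id=uuid4(),
        brain_id=uuid4(),
        original_filename="sample.pdf",  # placeholder file
        path=Path("sample.pdf"),
        file_extension=FileExtension.pdf,
        file_sha1="placeholder-sha1",
    )
    processor = TikaProcessor(timeout=10.0, max_retries=3)
    processed = await processor.process_file(qfile)
    print(len(processed.chunks))


asyncio.run(main())
```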
--------------------------------------------------------------------------------
/core/quivr_core/processor/processor_base.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from abc import ABC, abstractmethod
3 | from importlib.metadata import PackageNotFoundError, version
4 | from typing import Any, Generic, List, TypeVar
5 |
6 | from attr import dataclass
7 | from langchain_core.documents import Document
8 |
9 | from quivr_core.files.file import FileExtension, QuivrFile
10 | from quivr_core.language.utils import detect_language
11 |
12 | logger = logging.getLogger("quivr_core")
13 |
14 |
15 | R = TypeVar("R", covariant=True)
16 |
17 |
18 | @dataclass
19 | class ProcessedDocument(Generic[R]):
20 | chunks: List[Document]
21 | processor_cls: str
22 | processor_response: R
23 |
24 |
25 | # TODO: processors should be cached somewhere ?
26 | # The processor should be cached by processor type
27 | # The cache should use a single
28 | class ProcessorBase(ABC, Generic[R]):
29 | supported_extensions: list[FileExtension | str]
30 |
31 | def check_supported(self, file: QuivrFile) -> None:
32 | if file.file_extension not in self.supported_extensions:
33 | raise ValueError(f"can't process a file of type {file.file_extension}")
34 |
35 | @property
36 | @abstractmethod
37 | def processor_metadata(self) -> dict[str, Any]:
38 | raise NotImplementedError
39 |
40 | async def process_file(self, file: QuivrFile) -> ProcessedDocument[R]:
41 | logger.debug(f"Processing file {file}")
42 | self.check_supported(file)
43 | docs = await self.process_file_inner(file)
44 | try:
45 | qvr_version = version("quivr-core")
46 | except PackageNotFoundError:
47 | qvr_version = "dev"
48 |
49 | for idx, doc in enumerate(docs.chunks, start=1):
50 | if "original_file_name" in doc.metadata:
51 | doc.page_content = f"Filename: {doc.metadata['original_file_name']} Content: {doc.page_content}"
52 | doc.page_content = doc.page_content.replace("\u0000", "")
53 | doc.page_content = doc.page_content.encode("utf-8", "replace").decode(
54 | "utf-8"
55 | )
56 | doc.metadata = {
57 | "chunk_index": idx,
58 | "quivr_core_version": qvr_version,
59 | "language": detect_language(
60 | text=doc.page_content.replace("\\n", " ").replace("\n", " "),
61 | low_memory=True,
62 | ).value,
63 | **file.metadata,
64 | **doc.metadata,
65 | **self.processor_metadata,
66 | }
67 | return docs
68 |
69 | @abstractmethod
70 | async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[R]:
71 | raise NotImplementedError
72 |
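A concrete processor only needs to declare `supported_extensions`, expose `processor_metadata`, and implement `process_file_inner`; `process_file` then enriches every chunk with the shared metadata (chunk index, quivr-core version, detected language, file metadata). A hypothetical minimal subclass as a sketch (not part of the library):

```python
from typing import Any

import aiofiles
from langchain_core.documents import Document

from quivr_core.files.file import FileExtension, QuivrFile
from quivr_core.processor.processor_base import ProcessedDocument, ProcessorBase


class WholeFileTxtProcessor(ProcessorBase[str]):
    """Hypothetical processor returning the whole text file as a single chunk."""

    supported_extensions = [FileExtension.txt]

    @property
    def processor_metadata(self) -> dict[str, Any]:
        return {"processor_cls": "WholeFileTxtProcessor"}

    async def process_file_inner(self, file: QuivrFile) -> ProcessedDocument[str]:
        async with aiofiles.open(file.path, mode="r") as f:
            content = await f.read()
        return ProcessedDocument(
            chunks=[Document(page_content=content)],
            processor_cls="WholeFileTxtProcessor",
            processor_response=content,
        )
```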
--------------------------------------------------------------------------------
/core/quivr_core/processor/splitter.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 |
3 |
4 | class SplitterConfig(BaseModel):
5 | """
6 | This class is used to configure the chunking of the documents.
7 |
8 |     Chunk size is the number of characters in each chunk.
9 |     Chunk overlap is the number of characters a chunk shares with the previous chunk.
10 | """
11 |
12 | chunk_size: int = 400
13 | chunk_overlap: int = 100
14 |
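The defaults (400/100) match the `splitter_config` blocks in the test YAML configs further down. Overriding them is a one-liner; note that when the config feeds a tiktoken-based splitter (as in the default processors), the sizes are effectively counted in tokens rather than characters:

```python
from quivr_core.processor.splitter import SplitterConfig

config = SplitterConfig(chunk_size=256, chunk_overlap=32)
print(config.model_dump())  # {'chunk_size': 256, 'chunk_overlap': 32}
```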
--------------------------------------------------------------------------------
/core/quivr_core/rag/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/rag/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/rag/entities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/rag/entities/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/rag/entities/chat.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import Any, Generator, Tuple, List
3 | from uuid import UUID, uuid4
4 |
5 | from langchain_core.messages import AIMessage, HumanMessage
6 |
7 | from quivr_core.rag.entities.models import ChatMessage
8 |
9 |
10 | class ChatHistory:
11 | """
12 | ChatHistory is a class that maintains a record of chat conversations. Each message
13 | in the history is represented by an instance of the `ChatMessage` class, and the
14 | chat history is stored internally as a list of these `ChatMessage` objects.
15 | The class provides methods to retrieve, append, iterate, and manipulate the chat
16 | history, as well as utilities to convert the messages into specific formats
17 | and support deep copying.
18 | """
19 |
20 | def __init__(self, chat_id: UUID, brain_id: UUID | None) -> None:
21 | """Init a new ChatHistory object.
22 |
23 | Args:
24 | chat_id (UUID): A unique identifier for the chat session.
25 | brain_id (UUID | None): An optional identifier for the brain associated with the chat.
26 | """
27 | self.id = chat_id
28 | self.brain_id = brain_id
29 | # TODO(@aminediro): maybe use a deque() instead ?
30 | self._msgs: list[ChatMessage] = []
31 |
32 | def get_chat_history(self, newest_first: bool = False) -> List[ChatMessage]:
33 | """
34 | Retrieves the chat history, optionally sorted in reverse chronological order.
35 |
36 | Args:
37 | newest_first (bool, optional): If True, returns the messages in reverse order (newest first). Defaults to False.
38 |
39 | Returns:
40 | List[ChatMessage]: A sorted list of chat messages.
41 | """
42 | history = sorted(self._msgs, key=lambda msg: msg.message_time)
43 | if newest_first:
44 | return history[::-1]
45 | return history
46 |
47 | def __len__(self):
48 | return len(self._msgs)
49 |
50 | def append(
51 | self, langchain_msg: AIMessage | HumanMessage, metadata: dict[str, Any] = {}
52 | ):
53 | """
54 | Appends a new message to the chat history.
55 |
56 | Args:
57 | langchain_msg (AIMessage | HumanMessage): The message content (either an AI or Human message).
58 | metadata (dict[str, Any], optional): Additional metadata related to the message. Defaults to an empty dictionary.
59 | """
60 | chat_msg = ChatMessage(
61 | chat_id=self.id,
62 | message_id=uuid4(),
63 | brain_id=self.brain_id,
64 | msg=langchain_msg,
65 | message_time=datetime.now(),
66 | metadata=metadata,
67 | )
68 | self._msgs.append(chat_msg)
69 |
70 | def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]:
71 | """
72 | Iterates over the chat history in pairs, returning a HumanMessage followed by an AIMessage.
73 |
74 | Yields:
75 | Tuple[HumanMessage, AIMessage]: Pairs of human and AI messages.
76 |
77 | Raises:
78 | AssertionError: If the messages in the pair are not in the expected order (i.e., a HumanMessage followed by an AIMessage).
79 | """
80 | # Reverse the chat_history, newest first
81 | it = iter(self.get_chat_history(newest_first=True))
82 | for ai_message, human_message in zip(it, it, strict=False):
83 | assert isinstance(
84 | human_message.msg, HumanMessage
85 | ), f"msg {human_message} is not HumanMessage"
86 | assert isinstance(
87 | ai_message.msg, AIMessage
88 |             ), f"msg {ai_message} is not AIMessage"
89 | yield (human_message.msg, ai_message.msg)
90 |
91 | def to_list(self) -> List[HumanMessage | AIMessage]:
92 | """
93 | Converts the chat history into a list of raw HumanMessage or AIMessage objects.
94 |
95 | Returns:
96 | list[HumanMessage | AIMessage]: A list of messages in their raw form, without metadata.
97 | """
98 |
99 | return [_msg.msg for _msg in self._msgs]
100 |
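A short sketch of the `ChatHistory` API described above (IDs are random placeholders):

```python
from uuid import uuid4

from langchain_core.messages import AIMessage, HumanMessage

from quivr_core.rag.entities.chat import ChatHistory

history = ChatHistory(chat_id=uuid4(), brain_id=None)
history.append(HumanMessage("What is Quivr?"))
history.append(AIMessage("A RAG framework built on quivr-core."))

# Raw langchain messages, in insertion order.
print(history.to_list())

# (human, ai) pairs, newest exchange first.
for human, ai in history.iter_pairs():
    print(human.content, "->", ai.content)
```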
--------------------------------------------------------------------------------
/core/quivr_core/rag/entities/models.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from enum import Enum
3 | from typing import Any, Dict, Optional
4 | from uuid import UUID
5 |
6 | from langchain_core.documents import Document
7 | from langchain_core.messages import AIMessage, HumanMessage
8 | from pydantic import BaseModel, Field
9 | from typing_extensions import TypedDict
10 |
11 |
12 | class cited_answer(BaseModel):
13 | """Answer the user question based only on the given sources, and cite the sources used."""
14 |
15 | answer: str = Field(
16 | ...,
17 | description="The answer to the user question, which is based only on the given sources.",
18 | )
19 | citations: list[int] = Field(
20 | ...,
21 | description="The integer IDs of the SPECIFIC sources which justify the answer.",
22 | )
23 |
24 | followup_questions: list[str] = Field(
25 | ...,
26 | description="Generate up to 3 follow-up questions that could be asked based on the answer given or context provided.",
27 | )
28 |
29 |
30 | class ChatMessage(BaseModel):
31 | chat_id: UUID
32 | message_id: UUID
33 | brain_id: UUID | None
34 | msg: HumanMessage | AIMessage
35 | message_time: datetime
36 | metadata: dict[str, Any]
37 |
38 |
39 | class KnowledgeStatus(str, Enum):
40 | ERROR = "ERROR"
41 | RESERVED = "RESERVED"
42 | PROCESSING = "PROCESSING"
43 | PROCESSED = "PROCESSED"
44 | UPLOADED = "UPLOADED"
45 |
46 |
47 | class Source(BaseModel):
48 | name: str
49 | source_url: str
50 | type: str
51 | original_file_name: str
52 | citation: str
53 |
54 |
55 | class RawRAGChunkResponse(TypedDict):
56 | answer: dict[str, Any]
57 | docs: dict[str, Any]
58 |
59 |
60 | class RawRAGResponse(TypedDict):
61 | answer: dict[str, Any]
62 | docs: dict[str, Any]
63 |
64 |
65 | class ChatLLMMetadata(BaseModel):
66 | name: str
67 | display_name: str | None = None
68 | description: str | None = None
69 | image_url: str | None = None
70 | brain_id: str | None = None
71 | brain_name: str | None = None
72 |
73 |
74 | class RAGResponseMetadata(BaseModel):
75 | citations: list[int] = Field(default_factory=list)
76 | followup_questions: list[str] = Field(default_factory=list)
77 | sources: list[Any] = Field(default_factory=list)
78 | metadata_model: ChatLLMMetadata | None = None
79 | workflow_step: str | None = None
80 |
81 |
82 | class ParsedRAGResponse(BaseModel):
83 | answer: str
84 | metadata: RAGResponseMetadata | None = None
85 |
86 |
87 | class ParsedRAGChunkResponse(BaseModel):
88 | answer: str
89 | metadata: RAGResponseMetadata
90 | last_chunk: bool = False
91 |
92 |
93 | class QuivrKnowledge(BaseModel):
94 | id: UUID
95 | file_name: str
96 | brain_ids: list[UUID] | None = None
97 | url: Optional[str] = None
98 | extension: str = ".txt"
99 | mime_type: str = "txt"
100 | status: KnowledgeStatus = KnowledgeStatus.PROCESSING
101 | source: Optional[str] = None
102 | source_link: str | None = None
103 | file_size: int | None = None # FIXME: Should not be optional @chloedia
104 | file_sha1: Optional[str] = None # FIXME: Should not be optional @chloedia
105 | updated_at: Optional[datetime] = None
106 | created_at: Optional[datetime] = None
107 | metadata: Optional[Dict[str, str]] = None
108 |
109 |
110 | class SearchResult(BaseModel):
111 | chunk: Document
112 | distance: float
113 |
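`cited_answer` is a Pydantic schema intended for structured LLM output. A hedged sketch of binding it to a chat model with LangChain's `with_structured_output` (the `langchain_openai` model and prompt are illustrative, not something this module prescribes, and an `OPENAI_API_KEY` is assumed):

```python
from langchain_openai import ChatOpenAI

from quivr_core.rag.entities.models import cited_answer

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # illustrative model choice
structured_llm = llm.with_structured_output(cited_answer)

result = structured_llm.invoke(
    "Source [1]: Quivr is an open-source RAG framework.\n"
    "Question: What is Quivr?"
)
print(result.answer, result.citations, result.followup_questions)
```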
--------------------------------------------------------------------------------
/core/quivr_core/storage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/quivr_core/storage/__init__.py
--------------------------------------------------------------------------------
/core/quivr_core/storage/file.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import mimetypes
3 | import os
4 | import warnings
5 | from contextlib import asynccontextmanager
6 | from enum import Enum
7 | from pathlib import Path
8 | from typing import Any, AsyncGenerator, AsyncIterable
9 | from uuid import UUID, uuid4
10 |
11 | import aiofiles
12 |
13 |
14 | class FileExtension(str, Enum):
15 | txt = ".txt"
16 | pdf = ".pdf"
17 | docx = ".docx"
18 |
19 |
20 | def get_file_extension(file_path: Path) -> FileExtension | str:
21 | try:
22 | mime_type, _ = mimetypes.guess_type(file_path.name)
23 | if mime_type:
24 | mime_ext = mimetypes.guess_extension(mime_type)
25 | if mime_ext:
26 | return FileExtension(mime_ext)
27 | return FileExtension(file_path.suffix)
28 | except ValueError:
29 | warnings.warn(
30 | f"File {file_path.name} extension isn't recognized. Make sure you have registered a parser for {file_path.suffix}",
31 | stacklevel=2,
32 | )
33 | return file_path.suffix
34 |
35 |
36 | async def load_qfile(brain_id: UUID, path: str | Path):
37 | if not isinstance(path, Path):
38 | path = Path(path)
39 |
40 | if not path.exists():
41 |         raise FileNotFoundError(f"file {path} doesn't exist")
42 |
43 | file_size = os.stat(path).st_size
44 |
45 | async with aiofiles.open(path, mode="rb") as f:
46 | file_sha1 = hashlib.sha1(await f.read()).hexdigest()
47 |
48 | try:
49 | # NOTE: when loading from existing storage, file name will be uuid
50 | id = UUID(path.name)
51 | except ValueError:
52 | id = uuid4()
53 |
54 | return QuivrFile(
55 | id=id,
56 | brain_id=brain_id,
57 | path=path,
58 | original_filename=path.name,
59 | file_extension=get_file_extension(path),
60 | file_size=file_size,
61 | file_sha1=file_sha1,
62 | )
63 |
64 |
65 | class QuivrFile:
66 | __slots__ = [
67 | "id",
68 | "brain_id",
69 | "path",
70 | "original_filename",
71 | "file_size",
72 | "file_extension",
73 | "file_sha1",
74 | ]
75 |
76 | def __init__(
77 | self,
78 | id: UUID,
79 | original_filename: str,
80 | path: Path,
81 | brain_id: UUID,
82 | file_sha1: str,
83 | file_extension: FileExtension | str,
84 | file_size: int | None = None,
85 | ) -> None:
86 | self.id = id
87 | self.brain_id = brain_id
88 | self.path = path
89 | self.original_filename = original_filename
90 | self.file_size = file_size
91 | self.file_extension = file_extension
92 | self.file_sha1 = file_sha1
93 |
94 | @asynccontextmanager
95 | async def open(self) -> AsyncGenerator[AsyncIterable[bytes], None]:
96 | # TODO(@aminediro) : match on path type
97 | f = await aiofiles.open(self.path, mode="rb")
98 | try:
99 | yield f
100 | finally:
101 | await f.close()
102 |
103 | @property
104 | def metadata(self) -> dict[str, Any]:
105 | return {
106 | "qfile_id": self.id,
107 | "qfile_path": self.path,
108 | "original_file_name": self.original_filename,
109 |             "file_sha1": self.file_sha1,
110 | "file_size": self.file_size,
111 | }
112 |
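`load_qfile` builds a `QuivrFile` from a path (hashing the content and guessing the extension), and `QuivrFile.open()` yields an async byte stream. A small sketch, assuming a local `data.txt` exists:

```python
import asyncio
from uuid import uuid4

from quivr_core.storage.file import load_qfile


async def main() -> None:
    qfile = await load_qfile(brain_id=uuid4(), path="data.txt")
    print(qfile.metadata)

    async with qfile.open() as f:
        raw = await f.read()
    print(len(raw), "bytes read")


asyncio.run(main())
```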
--------------------------------------------------------------------------------
/core/quivr_core/storage/storage_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from uuid import UUID
3 |
4 | from quivr_core.brain.info import StorageInfo
5 | from quivr_core.storage.local_storage import QuivrFile
6 |
7 |
8 | class StorageBase(ABC):
9 | """
10 |     Abstract base class for storage systems. All subclasses are required to define certain attributes and implement specific methods for managing files.
11 |
12 | Attributes:
13 | name (str): Name of the storage type.
14 | """
15 |
16 | name: str
17 |
18 | def __init_subclass__(cls, **kwargs):
19 | for required in ("name",):
20 |             if not getattr(cls, required, None):
21 | raise TypeError(
22 | f"Can't instantiate abstract class {cls.__name__} without {required} attribute defined"
23 | )
24 | return super().__init_subclass__(**kwargs)
25 |
26 | def __repr__(self) -> str:
27 | return f"storage_type: {self.name}"
28 |
29 | @abstractmethod
30 | def nb_files(self) -> int:
31 | """
32 | Abstract method to get the number of files in the storage.
33 |
34 | Returns:
35 | int: The number of files in the storage.
36 |
37 | Raises:
38 | Exception: If the method is not implemented.
39 | """
40 | raise Exception("Unimplemented nb_files method")
41 |
42 | @abstractmethod
43 | async def get_files(self) -> list[QuivrFile]:
44 | """
45 | Abstract asynchronous method to get the files `QuivrFile` in the storage.
46 |
47 | Returns:
48 | list[QuivrFile]: A list of QuivrFile objects representing the files in the storage.
49 |
50 | Raises:
51 | Exception: If the method is not implemented.
52 | """
53 | raise Exception("Unimplemented get_files method")
54 |
55 | @abstractmethod
56 | async def upload_file(self, file: QuivrFile, exists_ok: bool = False) -> None:
57 | """
58 | Abstract asynchronous method to upload a file to the storage.
59 |
60 | Args:
61 | file (QuivrFile): The file to upload.
62 | exists_ok (bool): If True, allows overwriting the file if it already exists. Default is False.
63 |
64 | Raises:
65 | Exception: If the method is not implemented.
66 | """
67 | raise Exception("Unimplemented upload_file method")
68 |
69 | @abstractmethod
70 | async def remove_file(self, file_id: UUID) -> None:
71 | """
72 | Abstract asynchronous method to remove a file from the storage.
73 |
74 | Args:
75 | file_id (UUID): The unique identifier of the file to be removed.
76 |
77 | Raises:
78 | Exception: If the method is not implemented.
79 | """
80 | raise Exception("Unimplemented remove_file method")
81 |
82 | def info(self) -> StorageInfo:
83 | """
84 | Returns information about the storage, including the storage type and the number of files.
85 |
86 | Returns:
87 | StorageInfo: An object containing details about the storage.
88 | """
89 | return StorageInfo(
90 | storage_type=self.name,
91 | n_files=self.nb_files(),
92 | )
93 |
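A hypothetical in-memory subclass, mainly to show which members `StorageBase` requires (`name`, `nb_files`, `get_files`, `upload_file`, `remove_file`); it is a sketch, not part of the library:

```python
from uuid import UUID

from quivr_core.storage.local_storage import QuivrFile
from quivr_core.storage.storage_base import StorageBase


class InMemoryStorage(StorageBase):
    """Hypothetical storage keeping QuivrFile objects in a dict."""

    name = "in_memory"

    def __init__(self) -> None:
        self._files: dict[UUID, QuivrFile] = {}

    def nb_files(self) -> int:
        return len(self._files)

    async def get_files(self) -> list[QuivrFile]:
        return list(self._files.values())

    async def upload_file(self, file: QuivrFile, exists_ok: bool = False) -> None:
        if file.id in self._files and not exists_ok:
            raise FileExistsError(f"file {file.id} already uploaded")
        self._files[file.id] = file

    async def remove_file(self, file_id: UUID) -> None:
        self._files.pop(file_id, None)
```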
--------------------------------------------------------------------------------
/core/scripts/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Exit immediately if a command exits with a non-zero status
4 | set -e
5 |
6 | # Constants
7 | IMAGE_NAME="quivr-core-test"
8 | IMAGE_TAG="latest"
9 | DOCKERFILE="Dockerfile.test"
10 | VOLUME_MAPPING="$PWD:/code"
11 | TOX_DIR="/code/.tox-docker"
12 | CMD="poetry run tox -p auto"
13 |
14 | # Functions
15 | build_image() {
16 | echo "Building Docker image..."
17 | docker build -f $DOCKERFILE -t $IMAGE_NAME:$IMAGE_TAG .
18 | }
19 |
20 | run_container() {
21 | echo "Running tests in Docker container..."
22 | docker run -it --rm \
23 | -e TOX_WORK_DIR=$TOX_DIR \
24 | -v $VOLUME_MAPPING \
25 | $IMAGE_NAME:$IMAGE_TAG $CMD
26 | }
27 |
28 | # Main script execution
29 | build_image
30 | run_container
31 |
32 | echo "Tests completed successfully."
33 |
--------------------------------------------------------------------------------
/core/scripts/run_tests_buildx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | # Constants
6 | IMAGE_NAME="quivr-core-test"
7 | IMAGE_TAG="latest"
8 | DOCKERFILE="Dockerfile.test"
9 | VOLUME_MAPPING="$PWD:/code"
10 | CMD="poetry run tox"
11 | PLATFORM="linux/amd64"
12 | BUILDER_NAME="amd64_builder"
13 |
14 | # Functions
15 | build_image() {
16 | echo "Building Docker image for $PLATFORM..."
17 | EXISTING_BUILDER=$(docker buildx ls | grep -w $BUILDER_NAME)
18 |
19 | # Create the builder if it doesn't exist
20 | if [ -z "$EXISTING_BUILDER" ]; then
21 | echo "Creating builder: $BUILDER_NAME"
22 | docker buildx create --use --name $BUILDER_NAME --platform $PLATFORM
23 | else
24 | echo "Builder $BUILDER_NAME already exists. Skipping creation."
25 | fi
26 |
27 | docker buildx build --platform $PLATFORM -f $DOCKERFILE -t $IMAGE_NAME:$IMAGE_TAG --load .
28 | }
29 |
30 | run_container() {
31 | echo "Running tests in Docker container..."
32 | docker run -it --rm --platform $PLATFORM -v $VOLUME_MAPPING $IMAGE_NAME:$IMAGE_TAG $CMD
33 | }
34 |
35 | # Main script execution
36 | build_image
37 | run_container
38 |
39 | echo "Tests completed successfully."
--------------------------------------------------------------------------------
/core/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/__init__.py
--------------------------------------------------------------------------------
/core/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from pathlib import Path
4 | from uuid import uuid4
5 |
6 | import pytest
7 | from langchain_core.embeddings import DeterministicFakeEmbedding
8 | from langchain_core.language_models import FakeListChatModel
9 | from langchain_core.messages.ai import AIMessageChunk
10 | from langchain_core.runnables.utils import AddableDict
11 | from langchain_core.vectorstores import InMemoryVectorStore
12 | from quivr_core.rag.entities.config import LLMEndpointConfig
13 | from quivr_core.files.file import FileExtension, QuivrFile
14 | from quivr_core.llm import LLMEndpoint
15 |
16 |
17 | @pytest.fixture(scope="function")
18 | def temp_data_file(tmp_path):
19 | data = "This is some test data."
20 | temp_file = tmp_path / "data.txt"
21 | temp_file.write_text(data)
22 | return temp_file
23 |
24 |
25 | @pytest.fixture(scope="function")
26 | def quivr_txt(temp_data_file):
27 | return QuivrFile(
28 | id=uuid4(),
29 | brain_id=uuid4(),
30 | original_filename=temp_data_file.name,
31 | path=temp_data_file,
32 | file_extension=FileExtension.txt,
33 | file_sha1="123",
34 | )
35 |
36 |
37 | @pytest.fixture
38 | def quivr_pdf():
39 | return QuivrFile(
40 | id=uuid4(),
41 | brain_id=uuid4(),
42 | original_filename="dummy.pdf",
43 | path=Path("./tests/processor/data/dummy.pdf"),
44 | file_extension=FileExtension.pdf,
45 | file_sha1="13bh234jh234",
46 | )
47 |
48 |
49 | @pytest.fixture
50 | def full_response():
51 | return "Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and respond to human language in a way that is both valuable and meaningful. NLP combines computational linguistics—rule-based modeling of human language—with statistical, machine learning, and deep learning models. This combination allows computers to process human language in the form of text or voice data and to understand its full meaning, complete with the speaker or writer’s intent and sentiment. Key tasks in NLP include text and speech recognition, translation, sentiment analysis, and topic segmentation."
52 |
53 |
54 | @pytest.fixture
55 | def chunks_stream_answer():
56 | with open("./tests/chunk_stream_fixture.jsonl", "r") as f:
57 | raw_chunks = list(f)
58 |
59 | chunks = []
60 | for rc in raw_chunks:
61 | chunk = AddableDict(**json.loads(rc))
62 | if "answer" in chunk:
63 | chunk["answer"] = AIMessageChunk(**chunk["answer"])
64 | chunks.append(chunk)
65 | return chunks
66 |
67 |
68 | @pytest.fixture(autouse=True)
69 | def openai_api_key():
70 | os.environ["OPENAI_API_KEY"] = "this-is-a-test-key"
71 |
72 |
73 | @pytest.fixture
74 | def answers():
75 | return [f"answer_{i}" for i in range(10)]
76 |
77 |
78 | @pytest.fixture(scope="function")
79 | def fake_llm(answers: list[str]):
80 | llm = FakeListChatModel(responses=answers)
81 | return LLMEndpoint(llm=llm, llm_config=LLMEndpointConfig(model="fake_model"))
82 |
83 |
84 | @pytest.fixture(scope="function")
85 | def embedder():
86 | return DeterministicFakeEmbedding(size=20)
87 |
88 |
89 | @pytest.fixture(scope="function")
90 | def mem_vector_store(embedder):
91 | return InMemoryVectorStore(embedder)
92 |
--------------------------------------------------------------------------------
/core/tests/fixture_chunks.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | from uuid import uuid4
4 |
5 | from langchain_core.embeddings import DeterministicFakeEmbedding
6 | from langchain_core.messages.ai import AIMessageChunk
7 | from langchain_core.vectorstores import InMemoryVectorStore
8 | from quivr_core.rag.entities.chat import ChatHistory
9 | from quivr_core.rag.entities.config import LLMEndpointConfig, RetrievalConfig
10 | from quivr_core.llm import LLMEndpoint
11 | from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph
12 |
13 |
14 | async def main():
15 | retrieval_config = RetrievalConfig(llm_config=LLMEndpointConfig(model="gpt-4o"))
16 | embedder = DeterministicFakeEmbedding(size=20)
17 | vec = InMemoryVectorStore(embedder)
18 |
19 | llm = LLMEndpoint.from_config(retrieval_config.llm_config)
20 | chat_history = ChatHistory(uuid4(), uuid4())
21 | rag_pipeline = QuivrQARAGLangGraph(
22 | retrieval_config=retrieval_config, llm=llm, vector_store=vec
23 | )
24 |
25 | conversational_qa_chain = rag_pipeline.build_chain()
26 |
27 | with open("response.jsonl", "w") as f:
28 | async for event in conversational_qa_chain.astream_events(
29 | {
30 | "messages": [
31 | ("user", "What is NLP, give a very long detailed answer"),
32 | ],
33 | "chat_history": chat_history,
34 | "custom_personality": None,
35 | },
36 | version="v1",
37 | config={"metadata": {}},
38 | ):
39 | kind = event["event"]
40 | if (
41 | kind == "on_chat_model_stream"
42 | and event["metadata"]["langgraph_node"] == "generate"
43 | ):
44 | chunk = event["data"]["chunk"]
45 | dict_chunk = {
46 | k: v.dict() if isinstance(v, AIMessageChunk) else v
47 | for k, v in chunk.items()
48 | }
49 | f.write(json.dumps(dict_chunk) + "\n")
50 |
51 |
52 | asyncio.run(main())
53 |
--------------------------------------------------------------------------------
/core/tests/processor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/community/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/community/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/community/test_markdown_processor.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from quivr_core.files.file import FileExtension, QuivrFile
6 | from quivr_core.processor.implementations.default import MarkdownProcessor
7 |
8 | unstructured = pytest.importorskip("unstructured")
9 |
10 |
11 | @pytest.mark.unstructured
12 | @pytest.mark.asyncio
13 | async def test_markdown_processor():
14 | p = Path("./tests/processor/data/guidelines_code.md")
15 | f = QuivrFile(
16 | id=uuid4(),
17 | brain_id=uuid4(),
18 | original_filename=p.stem,
19 | path=p,
20 | file_extension=FileExtension.md,
21 | file_sha1="123",
22 | )
23 | processor = MarkdownProcessor()
24 | result = await processor.process_file(f)
25 | assert len(result) > 0
26 |
27 |
28 | @pytest.mark.unstructured
29 | @pytest.mark.asyncio
30 | async def test_markdown_processor_fail(quivr_txt):
31 | processor = MarkdownProcessor()
32 | with pytest.raises(ValueError):
33 | await processor.process_file(quivr_txt)
34 |
--------------------------------------------------------------------------------
/core/tests/processor/data/dummy.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/data/dummy.pdf
--------------------------------------------------------------------------------
/core/tests/processor/docx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/docx/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/docx/demo.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/docx/demo.docx
--------------------------------------------------------------------------------
/core/tests/processor/docx/test_docx.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from quivr_core.files.file import FileExtension, QuivrFile
6 | from quivr_core.processor.implementations.default import DOCXProcessor
7 |
8 | unstructured = pytest.importorskip("unstructured")
9 |
10 |
11 | @pytest.mark.unstructured
12 | @pytest.mark.asyncio
13 | async def test_docx_filedocx():
14 | p = Path("./tests/processor/docx/demo.docx")
15 | f = QuivrFile(
16 | id=uuid4(),
17 | brain_id=uuid4(),
18 | original_filename=p.stem,
19 | path=p,
20 | file_extension=FileExtension.docx,
21 | file_sha1="123",
22 | )
23 | processor = DOCXProcessor()
24 | result = await processor.process_file(f)
25 | assert len(result) > 0
26 |
27 |
28 | @pytest.mark.unstructured
29 | @pytest.mark.asyncio
30 | async def test_docx_processor_fail(quivr_txt):
31 | processor = DOCXProcessor()
32 | with pytest.raises(ValueError):
33 | await processor.process_file(quivr_txt)
34 |
--------------------------------------------------------------------------------
/core/tests/processor/epub/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/epub/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/epub/page-blanche.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/epub/page-blanche.epub
--------------------------------------------------------------------------------
/core/tests/processor/epub/sway.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/epub/sway.epub
--------------------------------------------------------------------------------
/core/tests/processor/epub/test_epub_processor.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from quivr_core.files.file import FileExtension, QuivrFile
6 | from quivr_core.processor.implementations.default import EpubProcessor
7 |
8 | unstructured = pytest.importorskip("unstructured")
9 |
10 |
11 | @pytest.mark.unstructured
12 | @pytest.mark.asyncio
13 | async def test_epub_page_blanche():
14 | p = Path("./tests/processor/epub/page-blanche.epub")
15 | f = QuivrFile(
16 | id=uuid4(),
17 | brain_id=uuid4(),
18 | original_filename=p.stem,
19 | path=p,
20 | file_extension=FileExtension.epub,
21 | file_sha1="123",
22 | )
23 | processor = EpubProcessor()
24 | result = await processor.process_file(f)
25 | assert len(result) == 0
26 |
27 |
28 | @pytest.mark.unstructured
29 | @pytest.mark.asyncio
30 | async def test_epub_processor():
31 | p = Path("./tests/processor/epub/sway.epub")
32 | f = QuivrFile(
33 | id=uuid4(),
34 | brain_id=uuid4(),
35 | original_filename=p.stem,
36 | path=p,
37 | file_extension=FileExtension.epub,
38 | file_sha1="123",
39 | )
40 |
41 | processor = EpubProcessor()
42 | result = await processor.process_file(f)
43 | assert len(result) > 0
44 |
45 |
46 | @pytest.mark.unstructured
47 | @pytest.mark.asyncio
48 | async def test_epub_processor_fail(quivr_txt):
49 | processor = EpubProcessor()
50 | with pytest.raises(ValueError):
51 | await processor.process_file(quivr_txt)
52 |
--------------------------------------------------------------------------------
/core/tests/processor/odt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/odt/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/odt/bad_odt.odt:
--------------------------------------------------------------------------------
1 | File Examples | Download redirect... Downloading... Please wait a moment
2 | 
--------------------------------------------------------------------------------
/core/tests/processor/odt/sample.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/odt/sample.odt
--------------------------------------------------------------------------------
/core/tests/processor/odt/test_odt.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from quivr_core.files.file import FileExtension, QuivrFile
6 | from quivr_core.processor.implementations.default import ODTProcessor
7 |
8 | unstructured = pytest.importorskip("unstructured")
9 |
10 |
11 | @pytest.mark.unstructured
12 | @pytest.mark.asyncio
13 | async def test_odt_processor():
14 | p = Path("./tests/processor/odt/sample.odt")
15 | f = QuivrFile(
16 | id=uuid4(),
17 | brain_id=uuid4(),
18 | original_filename=p.stem,
19 | path=p,
20 | file_extension=FileExtension.odt,
21 | file_sha1="123",
22 | )
23 | processor = ODTProcessor()
24 | result = await processor.process_file(f)
25 | assert len(result) > 0
26 |
27 |
28 | @pytest.mark.unstructured
29 | @pytest.mark.asyncio
30 | async def test_odt_processor_fail():
31 | p = Path("./tests/processor/odt/bad_odt.odt")
32 | f = QuivrFile(
33 | id=uuid4(),
34 | brain_id=uuid4(),
35 | original_filename=p.stem,
36 | path=p,
37 | file_extension=FileExtension.txt,
38 | file_sha1="123",
39 | )
40 | processor = ODTProcessor()
41 | with pytest.raises(ValueError):
42 | await processor.process_file(f)
43 |
--------------------------------------------------------------------------------
/core/tests/processor/pdf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/pdf/__init__.py
--------------------------------------------------------------------------------
/core/tests/processor/pdf/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/core/tests/processor/pdf/sample.pdf
--------------------------------------------------------------------------------
/core/tests/processor/pdf/test_unstructured_pdf_processor.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from quivr_core.files.file import FileExtension, QuivrFile
6 |
7 | unstructured = pytest.importorskip("unstructured")
8 |
9 | all_but_pdf = list(filter(lambda ext: ext != ".pdf", list(FileExtension)))
10 |
11 |
12 | @pytest.mark.unstructured
13 | @pytest.mark.asyncio
14 | async def test_unstructured_pdf_processor():
15 | from quivr_core.processor.implementations.default import UnstructuredPDFProcessor
16 |
17 | p = Path("./tests/processor/pdf/sample.pdf")
18 | f = QuivrFile(
19 | id=uuid4(),
20 | brain_id=uuid4(),
21 | original_filename=p.stem,
22 | path=p,
23 | file_extension=FileExtension.pdf,
24 | file_sha1="123",
25 | )
26 | processor = UnstructuredPDFProcessor()
27 | result = await processor.process_file(f)
28 | assert len(result) > 0
29 |
30 |
31 | @pytest.mark.unstructured
32 | @pytest.mark.parametrize("ext", all_but_pdf)
33 | @pytest.mark.asyncio
34 | async def test_unstructured_pdf_processor_fail(ext):
35 | from quivr_core.processor.implementations.default import UnstructuredPDFProcessor
36 |
37 | p = Path("./tests/processor/pdf/sample.pdf")
38 | f = QuivrFile(
39 | id=uuid4(),
40 | brain_id=uuid4(),
41 | original_filename=p.stem,
42 | path=p,
43 | file_extension=ext,
44 | file_sha1="123",
45 | )
46 | processor = UnstructuredPDFProcessor()
47 | with pytest.raises(ValueError):
48 | await processor.process_file(f)
49 |
--------------------------------------------------------------------------------
/core/tests/processor/test_default_implementations.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from quivr_core.files.file import FileExtension
3 | from quivr_core.processor.processor_base import ProcessorBase
4 |
5 |
6 | @pytest.mark.base
7 | def test___build_processor():
8 | from langchain_community.document_loaders.base import BaseLoader
9 | from quivr_core.processor.implementations.default import _build_processor
10 |
11 | cls = _build_processor("TestCLS", BaseLoader, [FileExtension.txt])
12 |
13 | assert cls.__name__ == "TestCLS"
14 | assert issubclass(cls, ProcessorBase)
15 | assert "__init__" in cls.__dict__
16 | assert cls.supported_extensions == [FileExtension.txt]
17 | proc = cls()
18 | assert hasattr(proc, "loader_cls")
19 | # FIXME: proper mypy typing
20 | assert proc.loader_cls == BaseLoader # type: ignore
21 |
--------------------------------------------------------------------------------
/core/tests/processor/test_simple_txt_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from langchain_core.documents import Document
3 | from quivr_core.files.file import FileExtension
4 | from quivr_core.processor.implementations.simple_txt_processor import (
5 | SimpleTxtProcessor,
6 | recursive_character_splitter,
7 | )
8 | from quivr_core.processor.splitter import SplitterConfig
9 |
10 |
11 | def test_recursive_character_splitter():
12 | doc = Document(page_content="abcdefgh", metadata={"key": "value"})
13 |
14 | docs = recursive_character_splitter(doc, chunk_size=2, chunk_overlap=1)
15 |
16 | assert [d.page_content for d in docs] == ["ab", "bc", "cd", "de", "ef", "fg", "gh"]
17 | assert [d.metadata for d in docs] == [doc.metadata] * len(docs)
18 |
19 |
20 | @pytest.mark.asyncio
21 | async def test_simple_processor(quivr_pdf, quivr_txt):
22 | proc = SimpleTxtProcessor(
23 | splitter_config=SplitterConfig(chunk_size=100, chunk_overlap=20)
24 | )
25 | assert proc.supported_extensions == [FileExtension.txt]
26 |
27 | with pytest.raises(ValueError):
28 | await proc.process_file(quivr_pdf)
29 |
30 | docs = await proc.process_file(quivr_txt)
31 |
32 | assert len(docs) == 1
33 | assert docs[0].page_content == "This is some test data."
34 |
--------------------------------------------------------------------------------
/core/tests/processor/test_tika_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from quivr_core.processor.implementations.tika_processor import TikaProcessor
3 |
4 | # TODO: TIKA server should be set
5 |
6 |
7 | @pytest.mark.tika
8 | @pytest.mark.asyncio
9 | async def test_process_file(quivr_pdf):
10 | tparser = TikaProcessor()
11 | doc = await tparser.process_file(quivr_pdf)
12 | assert len(doc) > 0
13 | assert doc[0].page_content.strip("\n") == "Dummy PDF download"
14 |
15 |
16 | @pytest.mark.tika
17 | @pytest.mark.asyncio
18 | async def test_send_parse_tika_exception(quivr_pdf):
19 | # TODO: Mock correct tika for retries
20 | tparser = TikaProcessor(tika_url="test.test")
21 | with pytest.raises(RuntimeError):
22 | doc = await tparser.process_file(quivr_pdf)
23 | assert len(doc) > 0
24 | assert doc[0].page_content.strip("\n") == "Dummy PDF download"
25 |
--------------------------------------------------------------------------------
/core/tests/processor/test_txt_processor.py:
--------------------------------------------------------------------------------
1 | from uuid import uuid4
2 |
3 | import pytest
4 | from quivr_core.storage.file import FileExtension, QuivrFile
5 |
6 | unstructured = pytest.importorskip("unstructured")
7 |
8 |
9 | @pytest.fixture
10 | def txt_qfile(temp_data_file):
11 | return QuivrFile(
12 | id=uuid4(),
13 | brain_id=uuid4(),
14 | original_filename="data.txt",
15 | path=temp_data_file,
16 | file_extension=FileExtension.txt,
17 | file_sha1="hash",
18 | )
19 |
20 |
21 | @pytest.mark.base
22 | @pytest.mark.asyncio
23 | async def test_process_txt(txt_qfile):
24 | from quivr_core.processor.implementations.default import TikTokenTxtProcessor
25 | from quivr_core.processor.splitter import SplitterConfig
26 |
27 | tparser = TikTokenTxtProcessor(
28 | splitter_config=SplitterConfig(chunk_size=20, chunk_overlap=0)
29 | )
30 | doc = await tparser.process_file(txt_qfile)
31 | assert len(doc) > 0
32 | assert doc[0].page_content == "This is some test data."
33 | assert (
34 | doc[0].metadata.items()
35 | >= {
36 | "chunk_index": 1,
37 | "original_file_name": "data.txt",
38 | "chunk_size": 6,
39 | "processor_cls": "TextLoader",
40 | "splitter": {"chunk_size": 20, "chunk_overlap": 0},
41 | **txt_qfile.metadata,
42 | }.items()
43 | )
44 |
--------------------------------------------------------------------------------
/core/tests/rag_config.yaml:
--------------------------------------------------------------------------------
1 | ingestion_config:
2 | parser_config:
3 | megaparse_config:
4 | strategy: "fast"
5 | pdf_parser: "unstructured"
6 | splitter_config:
7 | chunk_size: 400
8 | chunk_overlap: 100
9 |
10 | retrieval_config:
11 | # Maximum number of previous conversation iterations
12 | # to include in the context of the answer
13 | max_history: 10
14 |
15 | max_files: 20
16 | reranker_config:
17 | # The reranker supplier to use
18 | supplier: "cohere"
19 |
20 | # The model to use for the reranker for the given supplier
21 | model: "rerank-multilingual-v3.0"
22 |
23 | # Number of chunks returned by the reranker
24 | top_n: 5
25 | llm_config:
26 | # The LLM supplier to use
27 | supplier: "openai"
28 |
29 | # The model to use for the LLM for the given supplier
30 | model: "gpt-3.5-turbo-0125"
31 |
32 |     # Maximum number of tokens of context passed to the LLM
33 |     max_context_tokens: 2000
34 | 
35 |     # Maximum number of tokens the LLM can generate for the answer
36 |     max_output_tokens: 2000
37 |
38 | temperature: 0.7
39 | streaming: true
40 |
--------------------------------------------------------------------------------
/core/tests/rag_config_workflow.yaml:
--------------------------------------------------------------------------------
1 | ingestion_config:
2 | parser_config:
3 | megaparse_config:
4 | strategy: "fast"
5 | pdf_parser: "unstructured"
6 | splitter_config:
7 | chunk_size: 400
8 | chunk_overlap: 100
9 |
10 | retrieval_config:
11 | workflow_config:
12 | name: "standard RAG"
13 | nodes:
14 | - name: "START"
15 | edges: ["filter_history"]
16 |
17 | - name: "filter_history"
18 | edges: ["generate_chat_llm"]
19 |
20 | - name: "generate_chat_llm" # the name of the last node, from which we want to stream the answer to the user, should always start with "generate"
21 | edges: ["END"]
22 | # Maximum number of previous conversation iterations
23 | # to include in the context of the answer
24 | max_history: 10
25 |
26 | #prompt: "my prompt"
27 |
28 | max_files: 20
29 | reranker_config:
30 | # The reranker supplier to use
31 | supplier: "cohere"
32 |
33 | # The model to use for the reranker for the given supplier
34 | model: "rerank-multilingual-v3.0"
35 |
36 | # Number of chunks returned by the reranker
37 | top_n: 5
38 | llm_config:
39 | # The LLM supplier to use
40 | supplier: "openai"
41 |
42 | # The model to use for the LLM for the given supplier
43 | model: "gpt-3.5-turbo-0125"
44 |
45 |     # Maximum number of tokens of context passed to the LLM
46 |     max_context_tokens: 2000
47 | 
48 |     # Maximum number of tokens the LLM can generate for the answer
49 |     max_output_tokens: 2000
50 |
51 | temperature: 0.7
52 | streaming: true
53 |
--------------------------------------------------------------------------------
/core/tests/test_brain.py:
--------------------------------------------------------------------------------
1 | from dataclasses import asdict
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from langchain_core.documents import Document
6 | from langchain_core.embeddings import Embeddings
7 | from quivr_core.brain import Brain
8 | from quivr_core.rag.entities.chat import ChatHistory
9 | from quivr_core.llm import LLMEndpoint
10 | from quivr_core.storage.local_storage import TransparentStorage
11 |
12 |
13 | @pytest.mark.base
14 | def test_brain_empty_files_no_vectordb(fake_llm, embedder):
15 | # Testing no files
16 | with pytest.raises(ValueError):
17 | Brain.from_files(
18 | name="test_brain",
19 | file_paths=[],
20 | llm=fake_llm,
21 | embedder=embedder,
22 | )
23 |
24 |
25 | def test_brain_empty_files(fake_llm, embedder, mem_vector_store):
26 | brain = Brain.from_files(
27 | name="test_brain",
28 | file_paths=[],
29 | llm=fake_llm,
30 | embedder=embedder,
31 | vector_db=mem_vector_store,
32 | )
33 | assert brain
34 |
35 |
36 | @pytest.mark.asyncio
37 | async def test_brain_from_files_success(
38 | fake_llm: LLMEndpoint, embedder, temp_data_file, mem_vector_store
39 | ):
40 | brain = await Brain.afrom_files(
41 | name="test_brain",
42 | file_paths=[temp_data_file],
43 | embedder=embedder,
44 | llm=fake_llm,
45 | vector_db=mem_vector_store,
46 | )
47 | assert brain.name == "test_brain"
48 | assert len(brain.chat_history) == 0
49 | assert brain.llm == fake_llm
50 | assert brain.vector_db.embeddings == embedder
51 | assert isinstance(brain.default_chat, ChatHistory)
52 | assert len(brain.default_chat) == 0
53 |
54 | # storage
55 | assert isinstance(brain.storage, TransparentStorage)
56 | assert len(await brain.storage.get_files()) == 1
57 |
58 |
59 | @pytest.mark.asyncio
60 | async def test_brain_from_langchain_docs(embedder, fake_llm, mem_vector_store):
61 | chunk = Document("content_1", metadata={"id": uuid4()})
62 | brain = await Brain.afrom_langchain_documents(
63 | name="test",
64 | llm=fake_llm,
65 | langchain_documents=[chunk],
66 | embedder=embedder,
67 | vector_db=mem_vector_store,
68 | )
69 | # No appended files
70 | assert len(await brain.storage.get_files()) == 0
71 | assert len(brain.chat_history) == 0
72 |
73 |
74 | @pytest.mark.base
75 | @pytest.mark.asyncio
76 | async def test_brain_search(
77 | embedder: Embeddings,
78 | ):
79 | chunk1 = Document("content_1", metadata={"id": uuid4()})
80 | chunk2 = Document("content_2", metadata={"id": uuid4()})
81 | brain = await Brain.afrom_langchain_documents(
82 | name="test", langchain_documents=[chunk1, chunk2], embedder=embedder
83 | )
84 |
85 | k = 2
86 | result = await brain.asearch("content_1", n_results=k)
87 |
88 | assert len(result) == k
89 | assert result[0].chunk == chunk1
90 | assert result[1].chunk == chunk2
91 | assert result[0].distance == 0
92 | assert result[1].distance > result[0].distance
93 |
94 |
95 | @pytest.mark.asyncio
96 | async def test_brain_get_history(
97 | fake_llm: LLMEndpoint, embedder, temp_data_file, mem_vector_store
98 | ):
99 | brain = await Brain.afrom_files(
100 | name="test_brain",
101 | file_paths=[temp_data_file],
102 | embedder=embedder,
103 | llm=fake_llm,
104 | vector_db=mem_vector_store,
105 | )
106 |
107 | await brain.aask("question")
108 | await brain.aask("question")
109 |
110 | assert len(brain.default_chat) == 4
111 |
112 |
113 | @pytest.mark.base
114 | @pytest.mark.asyncio
115 | async def test_brain_ask_streaming(
116 | fake_llm: LLMEndpoint, embedder, temp_data_file, answers
117 | ):
118 | brain = await Brain.afrom_files(
119 | name="test_brain", file_paths=[temp_data_file], embedder=embedder, llm=fake_llm
120 | )
121 |
122 | response = ""
123 | async for chunk in brain.ask_streaming("question"):
124 | response += chunk.answer
125 |
126 | assert response == answers[1]
127 |
128 |
129 | def test_brain_info_empty(fake_llm: LLMEndpoint, embedder, mem_vector_store):
130 | storage = TransparentStorage()
131 | id = uuid4()
132 | brain = Brain(
133 | name="test",
134 | id=id,
135 | llm=fake_llm,
136 | embedder=embedder,
137 | storage=storage,
138 | vector_db=mem_vector_store,
139 | )
140 |
141 | assert asdict(brain.info()) == {
142 | "brain_id": id,
143 | "brain_name": "test",
144 | "files_info": asdict(storage.info()),
145 | "chats_info": {
146 | "nb_chats": 1, # start with a default chat
147 | "current_default_chat": brain.default_chat.id,
148 | "current_chat_history_length": 0,
149 | },
150 | "llm_info": asdict(fake_llm.info()),
151 | }
152 |
--------------------------------------------------------------------------------
/core/tests/test_chat_history.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | from uuid import uuid4
3 |
4 | import pytest
5 | from langchain_core.messages import AIMessage, HumanMessage
6 | from quivr_core.rag.entities.chat import ChatHistory
7 |
8 |
9 | @pytest.fixture
10 | def ai_message():
11 | return AIMessage("ai message")
12 |
13 |
14 | @pytest.fixture
15 | def human_message():
16 | return HumanMessage("human message")
17 |
18 |
19 | def test_chat_history_constructor():
20 | brain_id, chat_id = uuid4(), uuid4()
21 | chat_history = ChatHistory(brain_id=brain_id, chat_id=chat_id)
22 |
23 | assert chat_history.brain_id == brain_id
24 | assert chat_history.id == chat_id
25 | assert len(chat_history._msgs) == 0
26 |
27 |
28 | def test_chat_history_append(ai_message: AIMessage, human_message: HumanMessage):
29 | chat_history = ChatHistory(uuid4(), uuid4())
30 | chat_history.append(ai_message)
31 |
32 | assert len(chat_history) == 1
33 | chat_history.append(human_message)
34 | assert len(chat_history) == 2
35 |
36 |
37 | def test_chat_history_get_history(ai_message: AIMessage, human_message: HumanMessage):
38 | chat_history = ChatHistory(uuid4(), uuid4())
39 | chat_history.append(ai_message)
40 | chat_history.append(human_message)
41 | chat_history.append(ai_message)
42 | sleep(0.01)
43 | chat_history.append(human_message)
44 |
45 | msgs = chat_history.get_chat_history()
46 |
47 | assert len(msgs) == 4
48 | assert msgs[-1].message_time > msgs[0].message_time
49 | assert isinstance(msgs[0].msg, AIMessage)
50 | assert isinstance(msgs[1].msg, HumanMessage)
51 |
52 | msgs = chat_history.get_chat_history(newest_first=True)
53 | assert msgs[-1].message_time < msgs[0].message_time
54 |
55 |
56 | def test_chat_history_iter_pairs_invalid(
57 | ai_message: AIMessage, human_message: HumanMessage
58 | ):
59 | with pytest.raises(AssertionError):
60 | chat_history = ChatHistory(uuid4(), uuid4())
61 | chat_history.append(ai_message)
62 | chat_history.append(ai_message)
63 | next(chat_history.iter_pairs())
64 |
65 |
66 | def test_chat_history_iter_pairs(ai_message: AIMessage, human_message: HumanMessage):
67 | chat_history = ChatHistory(uuid4(), uuid4())
68 |
69 | chat_history.append(human_message)
70 | chat_history.append(ai_message)
71 | chat_history.append(human_message)
72 | chat_history.append(ai_message)
73 |
74 | result = list(chat_history.iter_pairs())
75 |
76 | assert result == [(human_message, ai_message), (human_message, ai_message)]
77 |
--------------------------------------------------------------------------------
/core/tests/test_config.py:
--------------------------------------------------------------------------------
1 | from quivr_core.rag.entities.config import LLMEndpointConfig, RetrievalConfig
2 |
3 |
4 | def test_default_llm_config():
5 | config = LLMEndpointConfig()
6 |
7 | assert (
8 | config.model_dump()
9 | == LLMEndpointConfig(
10 | model="gpt-4o",
11 | llm_base_url=None,
12 | llm_api_key=None,
13 | max_context_tokens=2000,
14 | max_output_tokens=2000,
15 | temperature=0.7,
16 | streaming=True,
17 | ).model_dump()
18 | )
19 |
20 |
21 | def test_default_retrievalconfig():
22 | config = RetrievalConfig()
23 |
24 | assert config.max_files == 20
25 | assert config.prompt is None
26 | print("\n\n", config.llm_config, "\n\n")
27 | print("\n\n", LLMEndpointConfig(), "\n\n")
28 | assert config.llm_config == LLMEndpointConfig()
29 |
--------------------------------------------------------------------------------
/core/tests/test_llm_endpoint.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 | from langchain_core.language_models import FakeListChatModel
5 | from pydantic import ValidationError
6 | from quivr_core.rag.entities.config import LLMEndpointConfig
7 | from quivr_core.llm import LLMEndpoint
8 |
9 |
10 | @pytest.mark.base
11 | def test_llm_endpoint_from_config_default():
12 | from langchain_openai import ChatOpenAI
13 |
14 | os.environ.pop("OPENAI_API_KEY", None)  # avoid KeyError if the key is not set
15 |
16 | with pytest.raises((ValidationError, ValueError)):
17 | llm = LLMEndpoint.from_config(LLMEndpointConfig())
18 |
19 | # Working default
20 | config = LLMEndpointConfig(llm_api_key="test")
21 | llm = LLMEndpoint.from_config(config=config)
22 |
23 | assert llm.supports_func_calling()
24 | assert isinstance(llm._llm, ChatOpenAI)
25 | assert llm._llm.model_name in llm.get_config().model
26 |
27 |
28 | @pytest.mark.base
29 | def test_llm_endpoint_from_config():
30 | from langchain_openai import ChatOpenAI
31 |
32 | config = LLMEndpointConfig(
33 | model="llama2", llm_api_key="test", llm_base_url="http://localhost:8441"
34 | )
35 | llm = LLMEndpoint.from_config(config)
36 |
37 | assert not llm.supports_func_calling()
38 | assert isinstance(llm._llm, ChatOpenAI)
39 | assert llm._llm.model_name in llm.get_config().model
40 |
41 |
42 | def test_llm_endpoint_constructor():
43 | llm_endpoint = FakeListChatModel(responses=[])
44 | llm_endpoint = LLMEndpoint(
45 | llm=llm_endpoint, llm_config=LLMEndpointConfig(model="test")
46 | )
47 |
48 | assert not llm_endpoint.supports_func_calling()
49 |
--------------------------------------------------------------------------------
/core/tests/test_quivr_file.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from uuid import uuid4
3 |
4 | from quivr_core.files.file import FileExtension, QuivrFile
5 |
6 |
7 | def test_create_file():
8 | id = uuid4()
9 | brain_id = uuid4()
10 | qfile = QuivrFile(
11 | id=id,
12 | brain_id=brain_id,
13 | original_filename="name",
14 | path=Path("/tmp/name"),
15 | file_extension=FileExtension.txt,
16 | file_sha1="123",
17 | )
18 |
19 | assert qfile.id == id
20 | assert qfile.brain_id == brain_id
21 | assert qfile.original_filename == "name"
22 | assert qfile.path == Path("/tmp/name")
23 |
24 |
25 | def test_create_file_add_metadata():
26 | id = uuid4()
27 | brain_id = uuid4()
28 | qfile = QuivrFile(
29 | id=id,
30 | brain_id=brain_id,
31 | original_filename="name",
32 | path=Path("/tmp/name"),
33 | file_extension=FileExtension.txt,
34 | file_sha1="123",
35 | metadata={"other_id": "id"},
36 | )
37 |
38 | assert qfile.metadata["other_id"] == "id"
39 |
--------------------------------------------------------------------------------
/core/tests/test_quivr_rag.py:
--------------------------------------------------------------------------------
1 | from uuid import uuid4
2 |
3 | import pytest
4 | from quivr_core.rag.entities.chat import ChatHistory
5 | from quivr_core.rag.entities.config import LLMEndpointConfig, RetrievalConfig
6 | from quivr_core.llm import LLMEndpoint
7 | from quivr_core.rag.entities.models import ParsedRAGChunkResponse, RAGResponseMetadata
8 | from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph
9 |
10 |
11 | @pytest.fixture(scope="function")
12 | def mock_chain_qa_stream(monkeypatch, chunks_stream_answer):
13 | class MockQAChain:
14 | async def astream_events(self, *args, **kwargs):
15 | default_metadata = {
16 | "langgraph_node": "generate",
17 | "is_final_node": False,
18 | "citations": None,
19 | "followup_questions": None,
20 | "sources": None,
21 | "metadata_model": None,
22 | }
23 |
24 | # Send all chunks except the last one
25 | for chunk in chunks_stream_answer[:-1]:
26 | yield {
27 | "event": "on_chat_model_stream",
28 | "metadata": default_metadata,
29 | "data": {"chunk": chunk["answer"]},
30 | }
31 |
32 | # Send the last chunk
33 | yield {
34 | "event": "end",
35 | "metadata": {
36 | "langgraph_node": "generate",
37 | "is_final_node": True,
38 | "citations": [],
39 | "followup_questions": None,
40 | "sources": [],
41 | "metadata_model": None,
42 | },
43 | "data": {"chunk": chunks_stream_answer[-1]["answer"]},
44 | }
45 |
46 | def mock_qa_chain(*args, **kwargs):
47 | self = args[0]
48 | self.final_nodes = ["generate"]
49 | return MockQAChain()
50 |
51 | monkeypatch.setattr(QuivrQARAGLangGraph, "build_chain", mock_qa_chain)
52 |
53 |
54 | @pytest.mark.base
55 | @pytest.mark.asyncio
56 | async def test_quivrqaraglanggraph(
57 | mem_vector_store, full_response, mock_chain_qa_stream, openai_api_key
58 | ):
59 | # Making sure the model supports function calling
60 | llm_config = LLMEndpointConfig(model="gpt-4o")
61 | llm = LLMEndpoint.from_config(llm_config)
62 | retrieval_config = RetrievalConfig(llm_config=llm_config)
63 | chat_history = ChatHistory(uuid4(), uuid4())
64 | rag_pipeline = QuivrQARAGLangGraph(
65 | retrieval_config=retrieval_config, llm=llm, vector_store=mem_vector_store
66 | )
67 |
68 | stream_responses: list[ParsedRAGChunkResponse] = []
69 |
70 | # Making sure that we are calling the func_calling code path
71 | assert rag_pipeline.llm_endpoint.supports_func_calling()
72 | async for resp in rag_pipeline.answer_astream(
73 | "answer in bullet points. tell me something", chat_history, []
74 | ):
75 | stream_responses.append(resp)
76 |
77 | # All chunks except the last one should have last_chunk=False
78 | assert all(
79 | not r.last_chunk for r in stream_responses[:-1]
80 | ), "Some chunks before last have last_chunk=True"
81 | assert stream_responses[-1].last_chunk
82 |
83 | # Every intermediate chunk should carry a non-empty answer
84 | for idx, response in enumerate(stream_responses[1:-1]):
85 | assert (
86 | len(response.answer) > 0
87 | ), f"Sent an empty answer {response} at index {idx+1}"
88 |
89 | # Verify metadata
90 | default_metadata = RAGResponseMetadata().model_dump()
91 | assert all(
92 | r.metadata.model_dump() == default_metadata for r in stream_responses[:-1]
93 | )
94 | last_response = stream_responses[-1]
95 | # TODO(@aminediro) : test responses with sources
96 | assert last_response.metadata.sources == []
97 | assert last_response.metadata.citations == []
98 |
99 | # Assert whole response makes sense
100 | assert "".join([r.answer for r in stream_responses]) == full_response
101 |
--------------------------------------------------------------------------------
/core/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from uuid import uuid4
2 |
3 | import pytest
4 | from langchain_core.messages.ai import AIMessageChunk
5 | from langchain_core.messages.tool import ToolCall
6 | from quivr_core.rag.utils import (
7 | get_prev_message_str,
8 | model_supports_function_calling,
9 | parse_chunk_response,
10 | )
11 |
12 |
13 | def test_model_supports_function_calling():
14 | assert model_supports_function_calling("gpt-4") is True
15 | assert model_supports_function_calling("ollama3") is False
16 |
17 |
18 | def test_get_prev_message_incorrect_message():
19 | with pytest.raises(StopIteration):
20 | chunk = AIMessageChunk(
21 | content="",
22 | tool_calls=[ToolCall(name="test", args={"answer": ""}, id=str(uuid4()))],
23 | )
24 | assert get_prev_message_str(chunk) == ""
25 |
26 |
27 | def test_get_prev_message_str():
28 | chunk = AIMessageChunk(content="")
29 | assert get_prev_message_str(chunk) == ""
30 | # Test a correct chunk
31 | chunk = AIMessageChunk(
32 | content="",
33 | tool_calls=[
34 | ToolCall(
35 | name="cited_answer",
36 | args={"answer": "this is an answer"},
37 | id=str(uuid4()),
38 | )
39 | ],
40 | )
41 | assert get_prev_message_str(chunk) == "this is an answer"
42 |
43 |
44 | def test_parse_chunk_response_nofunc_calling():
45 | rolling_msg = AIMessageChunk(content="")
46 | chunk = AIMessageChunk(content="next ")
47 | for i in range(10):
48 | rolling_msg, parsed_chunk, _ = parse_chunk_response(rolling_msg, chunk, False)
49 | assert rolling_msg.content == "next " * (i + 1)
50 | assert parsed_chunk == "next "
51 |
52 |
53 | def _check_rolling_msg(rol_msg: AIMessageChunk) -> bool:
54 | return (
55 | len(rol_msg.tool_calls) > 0
56 | and rol_msg.tool_calls[0]["name"] == "cited_answer"
57 | and rol_msg.tool_calls[0]["args"] is not None
58 | and "answer" in rol_msg.tool_calls[0]["args"]
59 | )
60 |
61 |
62 | def test_parse_chunk_response_func_calling(chunks_stream_answer):
63 | rolling_msg = AIMessageChunk(content="")
64 |
65 | rolling_msgs_history = []
66 | answer_str_history: list[str] = []
67 |
68 | for chunk in chunks_stream_answer:
69 | # Extract the AIMessageChunk from the chunk dictionary
70 | chunk_msg = chunk["answer"] # Get the AIMessageChunk from the dict
71 | rolling_msg, answer_str, _ = parse_chunk_response(rolling_msg, chunk_msg, True)
72 | rolling_msgs_history.append(rolling_msg)
73 | answer_str_history.append(answer_str)
74 |
75 | # Check that the rolling message accumulates the chunks correctly
76 | last_rol_msg = None
77 | last_answer_chunk = None
78 |
79 | # TEST1:
80 | # Asserting that parsing accumulates the chunks
81 | for rol_msg in rolling_msgs_history:
82 | if last_rol_msg is not None:
83 | # Check tool_call_chunks accumulated correctly
84 | assert (
85 | len(rol_msg.tool_call_chunks) > 0
86 | and rol_msg.tool_call_chunks[0]["name"] == "cited_answer"
87 | and rol_msg.tool_call_chunks[0]["args"]
88 | )
89 | answer_chunk = rol_msg.tool_call_chunks[0]["args"]
90 | # assert that the answer is accumulated
91 | assert last_answer_chunk in answer_chunk
92 |
93 | if _check_rolling_msg(rol_msg):
94 | last_rol_msg = rol_msg
95 | last_answer_chunk = rol_msg.tool_call_chunks[0]["args"]
96 |
97 | # TEST2:
98 | # Progressively acc answer string
99 | assert all(
100 | answer_str_history[i] in answer_str_history[i + 1]
101 | for i in range(len(answer_str_history) - 1)
102 | )
103 | # NOTE: Last chunk's answer should match the accumulated history
104 | assert last_rol_msg.tool_calls[0]["args"]["answer"] == answer_str_history[-1] # type: ignore
105 |
--------------------------------------------------------------------------------
/core/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | isolated_build = True
3 | skipsdist = true
4 | envlist =
5 | py311
6 | py311-base
7 | py311-unstructured
8 | py311-pdf
9 |
10 |
11 | [testenv:py311]
12 | allowlist_externals =
13 | poetry
14 | commands_pre =
15 | poetry install --no-root --with test
16 | commands =
17 | poetry run pytest tests/ -m "not base" \
18 | --ignore=./tests/processor/epub \
19 | --ignore=./tests/processor/docx \
20 | --ignore=./tests/processor/odt \
21 | --ignore=./tests/processor/pdf \
22 | --ignore=tests/processor/community
23 |
24 | [testenv:py311-base]
25 | allowlist_externals =
26 | poetry
27 | commands_pre =
28 | poetry install --no-root --with test -E base
29 | commands =
30 | poetry run pytest tests/ -m base \
31 | --ignore=./tests/processor/epub \
32 | --ignore=./tests/processor/docx \
33 | --ignore=./tests/processor/odt \
34 | --ignore=./tests/processor/pdf \
35 | --ignore=tests/processor/community
36 |
37 | [testenv:py311-unstructured]
38 | allowlist_externals =
39 | poetry
40 | commands_pre =
41 | poetry install --no-root \
42 | -E csv \
43 | -E md \
44 | -E ipynb \
45 | -E epub \
46 | -E odt \
47 | -E docx \
48 | -E pptx \
49 | -E xlsx \
50 | --with test
51 | commands =
52 | poetry run pytest \
53 | tests/processor/epub \
54 | tests/processor/docx \
55 | tests/processor/odt \
56 | tests/processor/community
58 |
59 |
60 | [testenv:py311-pdf]
61 | allowlist_externals =
62 | poetry
63 | commands_pre =
64 | poetry install --no-root -E pdf --with test
65 | commands =
66 | poetry run pytest tests/processor/pdf
67 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # venv
10 | .venv
11 |
--------------------------------------------------------------------------------
/docs/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # docs
2 |
3 | Documentation for `quivr-core`, built with MkDocs.
4 |
--------------------------------------------------------------------------------
/docs/docs/brain/brain.md:
--------------------------------------------------------------------------------
1 | ::: quivr_core.brain.brain
2 | options:
3 | heading_level: 2
4 |
--------------------------------------------------------------------------------
/docs/docs/brain/chat.md:
--------------------------------------------------------------------------------
1 | ## ChatHistory
2 |
3 | The `ChatHistory` class is where all the conversation between the user and the LLM gets stored. A `ChatHistory` object is transparently instantiated in the `Brain` every time you create one.
4 |
5 | At each interaction with `Brain.ask_streaming`, both your message and the LLM's response are added to this chat history. It's super handy because this history is used in the Retrieval-Augmented Generation (RAG) process to give the LLM more context, working as a form of memory between the user and the system and helping it generate better responses by looking at what’s already been said.
6 |
7 | You can also get some cool info about the brain by printing its details with the `print_info()` method, which shows things like how many chats are stored, the current chat history, and more. This makes it easy to keep track of what’s going on in your conversations and manage the context being sent to the LLM!
8 |
9 | ::: quivr_core.rag.entities.chat
10 | options:
11 | heading_level: 2
12 |
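13 | Below is a minimal sketch of how the history fills up as you talk to a brain. It assumes the async `Brain.afrom_files` / `aask` API used in the quivr_core tests, and the file path is only illustrative:
14 |
15 | ```python
16 | import asyncio
17 |
18 | from quivr_core import Brain
19 |
20 |
21 | async def main():
22 |     brain = await Brain.afrom_files(name="my brain", file_paths=["./my_doc.txt"])
23 |
24 |     await brain.aask("What is this document about?")
25 |     await brain.aask("Can you summarize it?")
26 |
27 |     # Each question/answer pair adds two messages to the default chat history
28 |     print(len(brain.default_chat))  # -> 4
29 |     brain.print_info()
30 |
31 |
32 | asyncio.run(main())
33 | ```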
--------------------------------------------------------------------------------
/docs/docs/brain/index.md:
--------------------------------------------------------------------------------
1 | # Brain
2 |
3 | The brain is the essential component of Quivr that stores and processes the knowledge you want to retrieve information from. Simply create a brain with the files you want to process and use the latest Quivr RAG workflow to retrieve information from that knowledge.
4 |
5 | Quick Start 🪄:
6 |
7 | ```python
8 | from quivr_core import Brain
9 | from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph
10 |
11 |
12 | brain = Brain.from_files(name="My Brain", file_paths=["file1.pdf", "file2.pdf"])
13 | answer = brain.ask("What is Quivr ?")
14 | print("Answer Quivr :", answer.answer)
15 |
16 | ```
17 |
18 | Pimp your Brain 🔨 :
19 |
20 | ```python
21 | from quivr_core import Brain
22 | from quivr_core.llm.llm_endpoint import LLMEndpoint
23 | from quivr_core.llm.llm_endpoint import LLMEndpointConfig
24 | from langchain_core.embeddings import DeterministicFakeEmbedding
25 | from langchain_core.language_models import FakeListChatModel
26 |
27 | brain = Brain.from_files(
28 | name="test_brain",
29 | file_paths=["my/information/source/file.pdf"],
30 | llm=LLMEndpoint(
31 | llm=FakeListChatModel(responses=["good"]),
32 | llm_config=LLMEndpointConfig(model="fake_model", llm_base_url="local"),
33 | ),
34 | embedder=DeterministicFakeEmbedding(size=20),
35 | )
36 |
37 | answer = brain.ask(
38 | "What is Quivr ?"
39 | )
40 | print("Answer Quivr :", answer.answer)
41 |
42 | ```
43 |
--------------------------------------------------------------------------------
/docs/docs/config/base_config.md:
--------------------------------------------------------------------------------
1 | # Configuration Base Class
2 |
3 | ::: quivr_core.base_config
4 | options:
5 | heading_level: 2
--------------------------------------------------------------------------------
/docs/docs/config/config.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | ## Retrieval Configuration
4 | ::: quivr_core.rag.entities.config.RetrievalConfig
5 |
6 | ## Workflow Configuration
7 | ::: quivr_core.rag.entities.config.WorkflowConfig
8 |
9 | ## LLM Configuration
10 | ::: quivr_core.rag.entities.config.LLMEndpointConfig
11 |
12 | ## Reranker Configuration
13 | ::: quivr_core.rag.entities.config.RerankerConfig
14 |
15 | ## Supported LLM Model Suppliers
16 | ::: quivr_core.rag.entities.config.DefaultModelSuppliers
17 |
18 | ## Supported Rerankers
19 | ::: quivr_core.rag.entities.config.DefaultRerankers
20 |
--------------------------------------------------------------------------------
/docs/docs/config/index.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | The configuration classes are based on [Pydantic](https://docs.pydantic.dev/latest/) and allow the configuration of the ingestion and retrieval workflows via YAML files.
4 |
5 | Below is an example of a YAML configuration file for a basic RAG retrieval workflow.
6 | ```yaml
7 | workflow_config:
8 | name: "standard RAG"
9 | nodes:
10 | - name: "START"
11 | edges: ["filter_history"]
12 |
13 | - name: "filter_history"
14 | edges: ["rewrite"]
15 |
16 | - name: "rewrite"
17 | edges: ["retrieve"]
18 |
19 | - name: "retrieve"
20 | edges: ["generate_rag"]
21 |
22 | - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user, should always start with "generate"
23 | edges: ["END"]
24 | # Maximum number of previous conversation iterations
25 | # to include in the context of the answer
26 | max_history: 10
27 |
28 | prompt: "my prompt"
29 |
30 | max_files: 20
31 | reranker_config:
32 | # The reranker supplier to use
33 | supplier: "cohere"
34 |
35 | # The model to use for the reranker for the given supplier
36 | model: "rerank-multilingual-v3.0"
37 |
38 | # Number of chunks returned by the reranker
39 | top_n: 5
40 | llm_config:
41 |
42 | max_context_tokens: 2000
43 |
44 | temperature: 0.7
45 | streaming: true
46 | ```
47 |
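48 | As a minimal sketch, such a YAML file can be loaded into a `RetrievalConfig` with `from_yaml` and passed to a brain when asking a question (the file names below are only illustrative):
49 |
50 | ```python
51 | from quivr_core import Brain
52 | from quivr_core.config import RetrievalConfig
53 |
54 | retrieval_config = RetrievalConfig.from_yaml("./my_retrieval_config.yaml")
55 |
56 | brain = Brain.from_files(name="my brain", file_paths=["./my_doc.pdf"])
57 | answer = brain.ask("What is this document about?", retrieval_config=retrieval_config)
58 | print(answer.answer)
59 | ```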
--------------------------------------------------------------------------------
/docs/docs/css/style.css:
--------------------------------------------------------------------------------
1 | .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt,
2 | .md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt {
3 | display: none !important;
4 | }
5 |
6 | /* CSS styles for side-by-side layout */
7 | .container {
8 | display: flex;
9 | justify-content: space-between;
10 | margin-bottom: 20px;
11 | /* Adjust spacing between sections */
12 | position: sticky;
13 | top: 2.4rem;
14 | z-index: 1000;
15 | /* Ensure it's above other content */
16 | background-color: white;
17 | /* Match your page background */
18 | padding: 0.2rem;
19 | }
20 |
21 | .example-heading {
22 | margin: 0.2rem !important;
23 | }
24 |
25 | .usage-examples {
26 | width: 100%;
27 | /* Adjust the width as needed */
28 | border: 1px solid var(--md-default-fg-color--light);
29 | border-radius: 2px;
30 | padding: 0.2rem;
31 | }
32 |
33 | /* Additional styling for the toggle */
34 | .toggle-example {
35 | cursor: pointer;
36 | color: white;
37 | text-decoration: underline;
38 | background-color: var(--md-primary-fg-color);
39 | padding: 0.2rem;
40 | border-radius: 2px;
41 | }
42 |
43 | .hidden {
44 | display: none;
45 | }
46 |
47 | /* mendable search styling */
48 | #my-component-root>div {
49 | bottom: 100px;
50 | }
--------------------------------------------------------------------------------
/docs/docs/examples/assets/chatbot_voice_flask.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/docs/docs/examples/assets/chatbot_voice_flask.mp4
--------------------------------------------------------------------------------
/docs/docs/examples/chatbot.md:
--------------------------------------------------------------------------------
1 | # Chatbot with Chainlit
2 |
3 | This example demonstrates a simple chatbot using **Quivr** and **Chainlit**, where users can upload a `.txt` file and ask questions based on its content.
4 |
5 | ---
6 |
7 | ## Prerequisites
8 |
9 | - **Python**: Version 3.8 or higher.
10 | - **OpenAI API Key**: Ensure you have a valid OpenAI API key.
11 |
12 | ---
13 |
14 | ## Installation
15 |
16 | 1. Clone the repository and navigate to the appropriate directory:
17 | ```bash
18 | git clone https://github.com/QuivrHQ/quivr
19 | cd examples/chatbot
20 | ```
21 |
22 | 2. Set the OpenAI API key as an environment variable:
23 | ```bash
24 | export OPENAI_API_KEY=''
25 | ```
26 |
27 | 3. Install the required dependencies:
28 | ```bash
29 | pip install -r requirements.lock
30 | ```
31 |
32 | ---
33 |
34 | ## Running the Chatbot
35 |
36 | 1. Start the Chainlit server:
37 | ```bash
38 | chainlit run main.py
39 | ```
40 |
41 | 2. Open your web browser and navigate to the URL displayed in the terminal (default: `http://localhost:8000`).
42 |
43 | ---
44 |
45 | ## Using the Chatbot
46 |
47 | ### File Upload
48 |
49 | 1. On the chatbot interface, upload a `.txt` file when prompted.
50 | 2. Ensure the file size is under **20MB**.
51 | 3. After uploading, the file is processed, and you will be notified when the chatbot is ready.
52 |
53 | ### Asking Questions
54 |
55 | 1. Type your questions into the chat input and press Enter.
56 | 2. The chatbot will respond based on the content of the uploaded file.
57 | 3. Relevant file sources for the answers are displayed in the chat.
58 |
59 | ---
60 |
61 | ## How It Works
62 |
63 | 1. **File Upload**:
64 | - Users upload a `.txt` file, which is temporarily saved.
65 | - The chatbot processes the file using Quivr to create a "brain."
66 |
67 | 2. **Session Handling**:
68 | - Chainlit manages the session to retain the file path and brain context.
69 |
70 | 3. **Question Answering**:
71 | - The chatbot uses the `ask_streaming` method from Quivr to process user queries.
72 | - Responses are streamed incrementally for faster feedback.
73 | - Relevant file excerpts (sources) are extracted and displayed.
74 |
75 | 4. **Retrieval Configuration**:
76 | - A YAML file (`basic_rag_workflow.yaml`) defines retrieval parameters for Quivr.
77 |
78 | ---
79 |
80 | ## Workflow
81 |
82 | ### Chat Start
83 |
84 | 1. Waits for the user to upload a `.txt` file.
85 | 2. Processes the file and creates a "brain."
86 | 3. Notifies the user when the system is ready for questions.
87 |
88 | ### On User Message
89 |
90 | 1. Retrieves the "brain" from the session.
91 | 2. Processes the user's question with Quivr.
92 | 3. Streams the response and displays it in the chat.
93 | 4. Extracts and shows relevant sources from the file.
94 |
95 | ---
96 |
97 | ## Features
98 |
99 | 1. **File Processing**: Creates a context-aware "brain" from the uploaded file.
100 | 2. **Streaming Responses**: Delivers answers incrementally for better user experience.
101 | 3. **Source Highlighting**: Displays file excerpts relevant to the answers.
102 |
103 | ---
104 |
105 | Enjoy interacting with your text files in a seamless Q&A format!
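106 |
107 | The question-answering step described above boils down to the streaming loop below. This is a simplified sketch (Chainlit session handling and source extraction omitted; the file path is illustrative), based on the `ask_streaming` API used in the quivr_core tests:
108 |
109 | ```python
110 | import asyncio
111 |
112 | from quivr_core import Brain
113 |
114 |
115 | async def answer_question(question: str) -> str:
116 |     brain = await Brain.afrom_files(name="user file", file_paths=["./uploaded.txt"])
117 |     response = ""
118 |     # Stream the answer chunk by chunk, as the chatbot does before sending it to the UI
119 |     async for chunk in brain.ask_streaming(question):
120 |         response += chunk.answer
121 |     return response
122 |
123 |
124 | print(asyncio.run(answer_question("What is this file about?")))
125 | ```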
--------------------------------------------------------------------------------
/docs/docs/examples/chatbot_voice.md:
--------------------------------------------------------------------------------
1 | # Voice Chatbot with Chainlit
2 |
3 | This example demonstrates how to create a voice-enabled chatbot using **Quivr** and **Chainlit**. The chatbot lets users upload a text file, ask questions about its content, and interact using speech.
4 |
5 | ---
6 |
7 | ## Prerequisites
8 |
9 | - **Python**: Version 3.8 or higher.
10 | - **OpenAI API Key**: Ensure you have a valid OpenAI API key.
11 |
12 | ---
13 |
14 | ## Installation
15 |
16 | 1. Clone the repository and navigate to the appropriate directory:
17 | ```bash
18 | git clone https://github.com/QuivrHQ/quivr
19 | cd examples/chatbot_voice
20 | ```
21 |
22 | 2. Set the OpenAI API key as an environment variable:
23 | ```bash
24 | export OPENAI_API_KEY=''
25 | ```
26 |
27 | 3. Install the required dependencies:
28 | ```bash
29 | pip install -r requirements.lock
30 | ```
31 |
32 | ---
33 |
34 | ## Running the Chatbot
35 |
36 | 1. Start the Chainlit server:
37 | ```bash
38 | chainlit run main.py
39 | ```
40 |
41 | 2. Open your web browser and navigate to the URL displayed in the terminal (default: `http://localhost:8000`).
42 |
43 | ---
44 |
45 | ## Using the Chatbot
46 |
47 | ### File Upload
48 |
49 | 1. Once the interface loads, the chatbot will prompt you to upload a `.txt` file.
50 | 2. Click on the upload area or drag-and-drop a text file. Ensure the file size is under **20MB**.
51 | 3. After processing, the chatbot will notify you that it’s ready for interaction.
52 |
53 | ### Asking Questions
54 |
55 | 1. Type your questions in the input box or upload an audio file containing your question.
56 | 2. If using text input, the chatbot will respond with an answer derived from the uploaded file's content.
57 | 3. If using audio input:
58 | - The chatbot converts speech to text using OpenAI Whisper.
59 | - Processes the text query and provides a response.
60 | - Converts the response to audio, enabling hands-free interaction.
61 |
62 | ---
63 |
64 | ## Features
65 |
66 | 1. **Text File Processing**: Creates a "brain" for the uploaded file using Quivr for question answering.
67 | 2. **Speech-to-Text (STT)**: Transcribes user-uploaded audio queries using OpenAI Whisper.
68 | 3. **Text-to-Speech (TTS)**: Converts chatbot responses into audio for a seamless voice chat experience.
69 | 4. **Source Display**: Shows relevant file sources for each response.
70 | 5. **Real-Time Updates**: Uses streaming for live feedback during processing.
71 |
72 | ---
73 |
74 | ## How It Works
75 |
76 | 1. **File Upload**: The user uploads a `.txt` file, which is temporarily saved and processed into a "brain" using Quivr.
77 | 2. **Session Handling**: Chainlit manages user sessions to retain the uploaded file and brain context.
78 | 3. **Voice Interaction**:
79 | - Audio queries are processed via the OpenAI Whisper API.
80 | - Responses are generated and optionally converted into audio for playback.
81 | 4. **Streaming**: The chatbot streams its answers incrementally, improving response speed.
82 |
83 | ---
84 |
85 | ## Workflow
86 |
87 | ### Chat Start
88 |
89 | 1. Waits for a text file upload.
90 | 2. Processes the file into a "brain."
91 | 3. Notifies the user when ready for interaction.
92 |
93 | ### On User Message
94 |
95 | 1. Extracts the "brain" and queries it using the message content.
96 | 2. Streams the response back to the user.
97 | 3. Displays file sources related to the response.
98 |
99 | ### Audio Interaction
100 |
101 | 1. Captures and processes audio chunks during user input.
102 | 2. Converts captured audio into text using Whisper.
103 | 3. Queries the brain and provides both text and audio responses.
104 |
105 | ---
106 |
107 | Enjoy interacting with your documents in both text and voice modes!
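108 |
109 | The voice round-trip described under "How It Works" can be sketched roughly as follows. This is only an illustration: it assumes the OpenAI Python client, and the model names and file paths are placeholders rather than the exact ones used in the example code:
110 |
111 | ```python
112 | from openai import OpenAI
113 | from quivr_core import Brain
114 |
115 | client = OpenAI()
116 | brain = Brain.from_files(name="voice brain", file_paths=["./uploaded.txt"])
117 |
118 | # 1. Speech-to-text: transcribe the recorded question with Whisper
119 | with open("question.wav", "rb") as audio:
120 |     transcript = client.audio.transcriptions.create(model="whisper-1", file=audio)
121 |
122 | # 2. Answer the transcribed question with the brain
123 | answer = brain.ask(transcript.text)
124 |
125 | # 3. Text-to-speech: turn the answer back into audio
126 | speech = client.audio.speech.create(model="tts-1", voice="alloy", input=answer.answer)
127 | with open("answer.mp3", "wb") as out:
128 |     out.write(speech.content)
129 | ```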
--------------------------------------------------------------------------------
/docs/docs/examples/chatbot_voice_flask.md:
--------------------------------------------------------------------------------
1 | # Voice Chatbot with Flask
2 |
3 | This example demonstrates a simple chatbot using **Flask** and **Quivr**, where users can upload a `.txt` file and ask questions based on its content. It supports speech-to-text and text-to-speech capabilities for a seamless interactive experience.
4 |
5 |
8 | ---
9 |
10 | ## Prerequisites
11 |
12 | - **Python**: Version 3.8 or higher.
13 | - **OpenAI API Key**: Ensure you have a valid OpenAI API key.
14 |
15 | ---
16 |
17 | ## Installation
18 |
19 | 1. Clone the repository and navigate to the project directory:
20 | ```bash
21 | git clone https://github.com/QuivrHQ/quivr
22 | cd examples/quivr-whisper
23 | ```
24 |
25 | 2. Set the OpenAI API key as an environment variable:
26 | ```bash
27 | export OPENAI_API_KEY=''
28 | ```
29 |
30 | 3. Install the required dependencies:
31 | ```bash
32 | pip install -r requirements.lock
33 | ```
34 |
35 | ---
36 |
37 | ## Running the Application
38 |
39 | 1. Start the Flask server:
40 | ```bash
41 | python app.py
42 | ```
43 |
44 | 2. Open your web browser and navigate to the URL displayed in the terminal (default: `http://localhost:5000`).
45 |
46 | ---
47 |
48 | ## Using the Chatbot
49 |
50 | ### File Upload
51 |
52 | 1. On the interface, upload a `.txt` file.
53 | 2. Ensure the file format is supported and its size is manageable.
54 | 3. The file will be processed, and a "brain" instance will be created.
55 |
56 | ### Asking Questions
57 |
58 | 1. Use the microphone to record your question (audio upload).
59 | 2. The chatbot will process your question and respond with an audio answer.
60 |
61 | ---
62 |
63 | ## How It Works
64 |
65 | ### File Upload
66 | - Users upload a `.txt` file.
67 | - The file is saved to the `uploads` directory and used to create a "brain" using **Quivr**.
68 |
69 | ### Session Management
70 | - Each session is associated with a unique ID, allowing the system to cache the user's "brain."
71 |
72 | ### Speech-to-Text
73 | - User audio files are processed with OpenAI's **Whisper** model to generate transcripts.
74 |
75 | ### Question Answering
76 | - The "brain" processes the transcribed text, retrieves relevant answers, and generates a response.
77 |
78 | ### Text-to-Speech
79 | - The answer is converted to audio using OpenAI's text-to-speech model and returned to the user.
80 |
81 | ---
82 |
83 | ## Workflow
84 |
85 | 1. **Upload File**:
86 | - The user uploads a `.txt` file.
87 | - A "brain" is created and cached for the session.
88 |
89 | 2. **Ask Questions**:
90 | - The user uploads an audio file containing a question.
91 | - The question is transcribed, processed, and answered using the "brain."
92 |
93 | 3. **Answer Delivery**:
94 | - The answer is converted to audio and returned to the user as a Base64-encoded string.
95 |
96 | ---
97 |
98 | ## Features
99 |
100 | 1. **File Upload and Processing**:
101 | - Creates a context-aware "brain" from the uploaded text file.
102 |
103 | 2. **Audio-based Interaction**:
104 | - Supports speech-to-text for input and text-to-speech for responses.
105 |
106 | 3. **Session Management**:
107 | - Retains user context throughout the interaction.
108 |
109 | 4. **Integration with OpenAI**:
110 | - Uses OpenAI models for transcription, answer generation, and audio synthesis.
111 |
112 | ---
113 |
114 | Enjoy interacting with your text files through an intuitive voice-based interface!
--------------------------------------------------------------------------------
/docs/docs/examples/custom_storage.md:
--------------------------------------------------------------------------------
1 | # Transparent Storage
2 |
3 | **todo**
4 |
--------------------------------------------------------------------------------
/docs/docs/examples/index.md:
--------------------------------------------------------------------------------
1 | # Examples
--------------------------------------------------------------------------------
/docs/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to Quivr Documentation
2 |
3 | Quivr helps you build your second brain, using the power of Generative AI to be your personal assistant!
4 |
5 | ## Key Features 🎯
6 |
7 | - **Opinionated RAG**: We created a RAG that is opinionated, fast and efficient so you can focus on your product
8 | - **LLMs**: Quivr works with any LLM; you can use it with OpenAI, Anthropic, Mistral, Gemma, etc.
9 | - **Any File**: Quivr works with any file; you can use it with PDF, TXT, Markdown, etc., and even add your own parsers.
10 | - **Customize your RAG**: Quivr allows you to customize your RAG, add internet search, add tools, etc.
11 | - **Integrations with Megaparse**: Quivr works with [Megaparse](https://github.com/quivrhq/megaparse), so you can ingest your files with Megaparse and use the RAG with Quivr.
12 |
13 | > We take care of the RAG so you can focus on your product. Simply install quivr-core and add it to your project. You can now ingest your files and ask questions.
14 |
15 | **We will keep improving the RAG and adding more features, stay tuned!**
16 |
17 |
18 | This is the core of Quivr, the brain of Quivr.com.
19 |
20 |
23 |
24 | ## Getting Started 🚀
25 |
26 | You can find everything on the [documentation](https://core.quivr.app/).
27 |
28 | ### Prerequisites 📋
29 |
30 | Ensure you have the following installed:
31 |
32 | - Python 3.10 or newer
33 |
34 | ### 30 seconds Installation 💽
35 |
36 |
37 | - **Step 1**: Install the package
38 |
39 |
40 |
41 | ```bash
42 | pip install quivr-core # Check that the installation worked
43 | ```
44 |
45 |
46 | - **Step 2**: Create a RAG with 5 lines of code
47 |
48 | ```python
49 | import tempfile
50 |
51 | from quivr_core import Brain
52 |
53 | if __name__ == "__main__":
54 | with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file:
55 | temp_file.write("Gold is a liquid of blue-like colour.")
56 | temp_file.flush()
57 |
58 | brain = Brain.from_files(
59 | name="test_brain",
60 | file_paths=[temp_file.name],
61 | )
62 |
63 | answer = brain.ask(
64 | "what is gold? asnwer in french"
65 | )
66 | print("answer:", answer)
67 | ```
68 |
69 |
--------------------------------------------------------------------------------
/docs/docs/parsers/index.md:
--------------------------------------------------------------------------------
1 | # Parsers
2 |
3 | Quivr provides a suite of parsers to extract structured data from various sources.
4 |
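5 | As a rough sketch (mirroring the basic ingestion workflow example in these docs; the file names are illustrative), parsing is configured through the `parser_config` section of an ingestion YAML and forwarded to the brain as processor kwargs:
6 |
7 | ```python
8 | from quivr_core import Brain
9 | from quivr_core.config import IngestionConfig
10 |
11 | ingestion_config = IngestionConfig.from_yaml("./my_ingestion_config.yaml")
12 |
13 | processor_kwargs = {
14 |     "megaparse_config": ingestion_config.parser_config.megaparse_config,
15 |     "splitter_config": ingestion_config.parser_config.splitter_config,
16 | }
17 |
18 | brain = Brain.from_files(
19 |     name="my brain",
20 |     file_paths=["./my_doc.pdf"],
21 |     processor_kwargs=processor_kwargs,
22 | )
23 | ```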
--------------------------------------------------------------------------------
/docs/docs/parsers/megaparse.md:
--------------------------------------------------------------------------------
1 | ## Megaparse
2 |
3 | ::: quivr_core.processor.implementations.megaparse_processor
4 | options:
5 | heading_level: 2
--------------------------------------------------------------------------------
/docs/docs/parsers/simple.md:
--------------------------------------------------------------------------------
1 | ## Simple Txt
2 |
3 | ::: quivr_core.processor.implementations.simple_txt_processor
4 | options:
5 | heading_level: 2
--------------------------------------------------------------------------------
/docs/docs/quickstart.md:
--------------------------------------------------------------------------------
1 | # Quick start
2 |
3 | If you need to quickly start talking to your list of files, here are the steps.
4 |
5 | 1. Add your API Keys to your environment variables
6 | ```python
7 | import os
8 | os.environ["OPENAI_API_KEY"] = "myopenai_apikey"
9 |
10 | ```
11 | Check our `.env.example` file to see the possible environment variables you can configure. Quivr supports APIs from Anthropic, OpenAI, and Mistral. It also supports local models using Ollama.
12 |
13 | 2. Create a Brain with Quivr default configuration
14 | ```python
15 | from quivr_core import Brain
16 |
17 | brain = Brain.from_files(name = "my smart brain",
18 | file_paths = ["/my_smart_doc.pdf", "/my_intelligent_doc.txt"],
19 | )
20 |
21 | ```
22 |
23 | 3. Launch a Chat
24 | ```python
25 | brain.print_info()
26 |
27 | from rich.console import Console
28 | from rich.panel import Panel
29 | from rich.prompt import Prompt
30 |
31 | console = Console()
32 | console.print(Panel.fit("Ask your brain !", style="bold magenta"))
33 |
34 | while True:
35 | # Get user input
36 | question = Prompt.ask("[bold cyan]Question[/bold cyan]")
37 |
38 | # Check if user wants to exit
39 | if question.lower() == "exit":
40 | console.print(Panel("Goodbye!", style="bold yellow"))
41 | break
42 |
43 | answer = brain.ask(question)
44 | # Print the answer with typing effect
45 | console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
46 |
47 | console.print("-" * console.width)
48 |
49 | brain.print_info()
50 | ```
51 |
52 | And now you are all set up to talk with your brain !
53 |
54 | ## Custom Brain
55 | If you want to change the language or embeddings model, you can modify the parameters of the brain.
56 |
57 | Let's say you want to use an LLM from Mistral and a specific embedding model:
58 | ```python
59 | from quivr_core import Brain
60 | from quivr_core.llm.llm_endpoint import LLMEndpoint, LLMEndpointConfig
61 | from langchain_core.embeddings import Embeddings
62 | brain = Brain.from_files(name = "my smart brain",
63 | file_paths = ["/my_smart_doc.pdf", "/my_intelligent_doc.txt"],
64 | llm=LLMEndpoint(
65 | llm_config=LLMEndpointConfig(model="mistral-small-latest", llm_base_url="https://api.mistral.ai/v1/chat/completions"),
66 | ),
67 | embedder=Embeddings(size=64),
68 | )
69 | ```
70 |
71 | Note: [Embeddings](https://python.langchain.com/docs/integrations/text_embedding/) is a langchain class that lets you choose from a large variety of embedding models. Check out the linked docs to see the range of models you can try.
72 |
73 | ## Launch with Chainlit
74 |
75 | If you want to quickly launch an interface with Chainlit, you can simply run the following at the root of the project:
76 | ```bash
77 | cd examples/chatbot
78 | rye sync
79 | rye run chainlit run chainlit.py
80 | ```
81 | For more detail, go in [examples/chatbot/chainlit.md](https://github.com/QuivrHQ/quivr/tree/main/examples/chatbot)
82 |
83 | Note: Modify the Brain configs directly in examples/chatbot/main.py.
84 |
--------------------------------------------------------------------------------
/docs/docs/storage/base.md:
--------------------------------------------------------------------------------
1 | # StorageBase
2 |
3 | ::: quivr_core.storage.storage_base
4 | options:
5 | heading_level: 2
6 |
--------------------------------------------------------------------------------
/docs/docs/storage/index.md:
--------------------------------------------------------------------------------
1 | # 🗄️ Storage
2 |
3 | ## Your Brain’s File Management System
4 |
5 | The `Storage` class is the backbone of how a brain interacts with files in `quivr-core`. Every brain holds a reference to an underlying storage system to manage its files. All storages should implement the `StorageBase` base class, which provides the structure and methods to make that happen seamlessly. Let's walk through how it works:
6 |
7 | - **Brain-Storage Connection:** Your brain holds a reference to a storage system. This class is the main way your brain can interact with and manage the files it uses. Adding files to a brain will upload them to the storage. This means that files in the storage are stored **before** processing!
8 | - **File Management:** the storage holds a set of `QuivrFile` objects, which are the building blocks of your brain’s file system. The storage can store them remotely, locally, or simply hold them in memory.
9 |
10 | ### What can you do with this storage system?
11 |
12 | 1. Upload Files: You can add new files to your storage whenever you need. The system also lets you decide whether to overwrite existing files or not.
13 | 2. Get Files: Need to see what's in your storage? No problem. You can easily retrieve a list of all the files that are stored.
14 | 3. Delete Files: Clean-up is simple. You can remove any file from your storage by referencing its unique file ID (more on that in `QuivrFile`).
15 |
16 | StorageBase is the foundation of how your brain organizes, uploads, retrieves, and deletes its files. It ensures that your brain can always stay up-to-date with the files it needs, making file management smooth and intuitive. You can build your own storage system by subclassing the `StorageBase` class and passing it to the brain. See [custom_storage](../examples/custom_storage.md) for more details.
17 |
18 | ### Storage Implementations in `quivr_core`
19 |
20 | `quivr_core` currently offers two storage implementations: `LocalStorage` and `TransparentStorage`:
21 |
22 | - **LocalStorage**:
23 | This storage type is perfect when you want to keep files on your local machine. `LocalStorage` saves your files to a specific directory, either a default path (`~/.cache/quivr/files`) or a user-defined location. It can store files by copying them or by creating symbolic links to the original files, based on your preference. This storage type also keeps track of file hashes to prevent accidental overwrites during uploads.
24 |
25 | - **TransparentStorage**:
26 | The `TransparentStorage` implementation offers a lightweight and flexible approach, mainly managing files in memory without a need for local file paths. This storage system is useful when you don't need persistent storage but rather an easy way to store and retrieve files temporarily during the brain's operation.
27 |
28 | Each of these storage systems has its own strengths, catering to different use cases. As `quivr_core` evolves, we will implement more and more storage systems, allowing for even more advanced and customized ways to manage your files, such as `S3Storage`, `NFSStorage`, and more.
29 |
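30 | Here is a minimal sketch of listing the `QuivrFile` objects a brain's storage holds. It assumes the async `Brain.afrom_files` API used in the quivr_core tests, and the file path is illustrative:
31 |
32 | ```python
33 | import asyncio
34 |
35 | from quivr_core import Brain
36 |
37 |
38 | async def main():
39 |     brain = await Brain.afrom_files(name="my brain", file_paths=["./my_doc.txt"])
40 |
41 |     # The default TransparentStorage keeps the uploaded files in memory
42 |     for qfile in await brain.storage.get_files():
43 |         print(qfile.id, qfile.original_filename, qfile.file_extension)
44 |
45 |
46 | asyncio.run(main())
47 | ```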
--------------------------------------------------------------------------------
/docs/docs/storage/local_storage.md:
--------------------------------------------------------------------------------
1 | # LocalStorage
2 |
3 | ::: quivr_core.storage.local_storage
4 | options:
5 | heading_level: 2
6 |
--------------------------------------------------------------------------------
/docs/docs/vectorstores/faiss.md:
--------------------------------------------------------------------------------
1 | # Faiss
--------------------------------------------------------------------------------
/docs/docs/vectorstores/index.md:
--------------------------------------------------------------------------------
1 | # Vector Stores
2 |
3 |
--------------------------------------------------------------------------------
/docs/docs/vectorstores/pgvector.md:
--------------------------------------------------------------------------------
1 | # PGVector
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/basic_ingestion.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/docs/docs/workflows/examples/basic_ingestion.excalidraw.png
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/basic_ingestion.md:
--------------------------------------------------------------------------------
1 | # Basic ingestion
2 |
3 | 
4 |
5 |
6 | Creating a basic ingestion workflow like the one above is simple, here are the steps:
7 |
8 | 1. Add your API Keys to your environment variables
9 | ```python
10 | import os
11 | os.environ["OPENAI_API_KEY"] = "myopenai_apikey"
12 |
13 | ```
14 | Check our `.env.example` file to see the possible environment variables you can configure. Quivr supports APIs from Anthropic, OpenAI, and Mistral. It also supports local models using Ollama.
15 |
16 | 2. Create the YAML file ``basic_ingestion_workflow.yaml`` and copy the following content in it
17 | ```yaml
18 | parser_config:
19 | megaparse_config:
20 | strategy: "auto" # for unstructured, it can be "auto", "fast", "hi_res", "ocr_only", see https://docs.unstructured.io/open-source/concepts/partitioning-strategies#partitioning-strategies
21 | pdf_parser: "unstructured"
22 | splitter_config:
23 | chunk_size: 400 # in tokens
24 | chunk_overlap: 100 # in tokens
25 | ```
26 |
27 | 3. Create a Brain using the above configuration and the list of files you want to ingest
28 | ```python
29 | from quivr_core import Brain
30 | from quivr_core.config import IngestionConfig
31 |
32 | config_file_name = "./basic_ingestion_workflow.yaml"
33 |
34 | ingestion_config = IngestionConfig.from_yaml(config_file_name)
35 |
36 | processor_kwargs = {
37 | "megaparse_config": ingestion_config.parser_config.megaparse_config,
38 | "splitter_config": ingestion_config.parser_config.splitter_config,
39 | }
40 |
41 | brain = Brain.from_files(name = "my smart brain",
42 | file_paths = ["./my_first_doc.pdf", "./my_second_doc.txt"],
43 | processor_kwargs=processor_kwargs,
44 | )
45 |
46 | ```
47 |
48 | 4. Launch a Chat
49 | ```python
50 | brain.print_info()
51 |
52 | from rich.console import Console
53 | from rich.panel import Panel
54 | from rich.prompt import Prompt
55 |
56 | console = Console()
57 | console.print(Panel.fit("Ask your brain !", style="bold magenta"))
58 |
59 | while True:
60 | # Get user input
61 | question = Prompt.ask("[bold cyan]Question[/bold cyan]")
62 |
63 | # Check if user wants to exit
64 | if question.lower() == "exit":
65 | console.print(Panel("Goodbye!", style="bold yellow"))
66 | break
67 |
68 | answer = brain.ask(question)
69 | # Print the answer with typing effect
70 | console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
71 |
72 | console.print("-" * console.width)
73 |
74 | brain.print_info()
75 | ```
76 |
77 | 5. You are now all set up to talk with your brain and test different chunking strategies by simply changing the configuration file!
78 |
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/basic_rag.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/docs/docs/workflows/examples/basic_rag.excalidraw.png
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/basic_rag.md:
--------------------------------------------------------------------------------
1 | # Basic RAG
2 |
3 | 
4 |
5 |
6 | Creating a basic RAG workflow like the one above is simple, here are the steps:
7 |
8 |
9 | 1. Add your API Keys to your environment variables
10 | ```python
11 | import os
12 | os.environ["OPENAI_API_KEY"] = "myopenai_apikey"
13 |
14 | ```
15 | Check our `.env.example` file to see the possible environment variables you can configure. Quivr supports APIs from Anthropic, OpenAI, and Mistral. It also supports local models using Ollama.
16 |
17 | 2. Create the YAML file ``basic_rag_workflow.yaml`` and copy the following content in it
18 | ```yaml
19 | workflow_config:
20 | name: "standard RAG"
21 | nodes:
22 | - name: "START"
23 | edges: ["filter_history"]
24 |
25 | - name: "filter_history"
26 | edges: ["rewrite"]
27 |
28 | - name: "rewrite"
29 | edges: ["retrieve"]
30 |
31 | - name: "retrieve"
32 | edges: ["generate_rag"]
33 |
34 | - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user
35 | edges: ["END"]
36 |
37 | # Maximum number of previous conversation iterations
38 | # to include in the context of the answer
39 | max_history: 10
40 |
41 | # Reranker configuration
42 | reranker_config:
43 | # The reranker supplier to use
44 | supplier: "cohere"
45 |
46 | # The model to use for the reranker for the given supplier
47 | model: "rerank-multilingual-v3.0"
48 |
49 | # Number of chunks returned by the reranker
50 | top_n: 5
51 |
52 | # Configuration for the LLM
53 | llm_config:
54 |
55 | # maximum number of tokens passed to the LLM to generate the answer
56 | max_input_tokens: 4000
57 |
58 | # temperature for the LLM
59 | temperature: 0.7
60 | ```
61 |
62 | 3. Create a Brain with the default configuration
63 | ```python
64 | from quivr_core import Brain
65 |
66 | brain = Brain.from_files(name = "my smart brain",
67 | file_paths = ["./my_first_doc.pdf", "./my_second_doc.txt"],
68 | )
69 |
70 | ```
71 |
72 | 4. Launch a Chat
73 | ```python
74 | brain.print_info()
75 |
76 | from rich.console import Console
77 | from rich.panel import Panel
78 | from rich.prompt import Prompt
79 | from quivr_core.config import RetrievalConfig
80 |
81 | config_file_name = "./basic_rag_workflow.yaml"
82 |
83 | retrieval_config = RetrievalConfig.from_yaml(config_file_name)
84 |
85 | console = Console()
86 | console.print(Panel.fit("Ask your brain !", style="bold magenta"))
87 |
88 | while True:
89 | # Get user input
90 | question = Prompt.ask("[bold cyan]Question[/bold cyan]")
91 |
92 | # Check if user wants to exit
93 | if question.lower() == "exit":
94 | console.print(Panel("Goodbye!", style="bold yellow"))
95 | break
96 |
97 | answer = brain.ask(question, retrieval_config=retrieval_config)
98 | # Print the answer with typing effect
99 | console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
100 |
101 | console.print("-" * console.width)
102 |
103 | brain.print_info()
104 | ```
105 |
106 | 5. You are now all set up to talk with your brain and test different retrieval strategies by simply changing the configuration file!
107 |
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/rag_with_web_search.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/docs/docs/workflows/examples/rag_with_web_search.excalidraw.png
--------------------------------------------------------------------------------
/docs/docs/workflows/examples/rag_with_web_search.md:
--------------------------------------------------------------------------------
1 | # RAG with web search
2 |
3 |
4 | 
5 |
6 | Follow the instructions below to create the agentic RAG workflow shown above, which includes some advanced capabilities such as:
7 |
8 | * **user intention detection** - the agent can detect if the user wants to activate the web search tool to look for information not present in the documents;
9 | * **dynamic chunk retrieval** - the number of retrieved chunks is not fixed, but determined dynamically using the reranker's relevance scores and the user-provided ``relevance_score_threshold``;
10 | * **web search** - the agent can search the web for more information if needed.
11 |
12 |
13 | ---
14 |
15 | 1. Add your API Keys to your environment variables
16 | ```python
17 | import os
18 | os.environ["OPENAI_API_KEY"] = "my_openai_api_key"
19 | os.environ["TAVILY_API_KEY"] = "my_tavily_api_key"
20 |
21 | ```
22 | Check our `.env.example` file to see the possible environment variables you can configure. Quivr supports APIs from Anthropic, OpenAI, and Mistral. It also supports local models using Ollama.
23 |
24 | 2. Create the YAML file ``rag_with_web_search_workflow.yaml`` and copy the following content in it
25 | ```yaml
26 | workflow_config:
27 |   name: "RAG with web search"
28 |
29 |   # List of tools that the agent can activate if the user instructions require it
30 |   available_tools:
31 |     - "web search"
32 |
33 |   nodes:
34 |     - name: "START"
35 |       conditional_edge:
36 |         routing_function: "routing_split"
37 |         conditions: ["edit_system_prompt", "filter_history"]
38 |
39 |     - name: "edit_system_prompt"
40 |       edges: ["filter_history"]
41 |
42 |     - name: "filter_history"
43 |       edges: ["dynamic_retrieve"]
44 |
45 |     - name: "dynamic_retrieve"
46 |       conditional_edge:
47 |         routing_function: "tool_routing"
48 |         conditions: ["run_tool", "generate_rag"]
49 |
50 |     - name: "run_tool"
51 |       edges: ["generate_rag"]
52 |
53 |     - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user
54 |       edges: ["END"]
55 |       tools:
56 |         - name: "cited_answer"
57 |
58 | # Maximum number of previous conversation iterations
59 | # to include in the context of the answer
60 | max_history: 10
61 |
62 | # Number of chunks returned by the retriever
63 | k: 40
64 |
65 | # Reranker configuration
66 | reranker_config:
67 |   # The reranker supplier to use
68 |   supplier: "cohere"
69 |
70 |   # The model to use for the reranker for the given supplier
71 |   model: "rerank-multilingual-v3.0"
72 |
73 |   # Number of chunks returned by the reranker
74 |   top_n: 5
75 |
76 |   # Among the chunks returned by the reranker, only those with relevance
77 |   # scores equal or above the relevance_score_threshold will be returned
78 |   # to the LLM to generate the answer (allowed values are between 0 and 1,
79 |   # a value of 0.1 works well with the cohere and jina rerankers)
80 |   relevance_score_threshold: 0.01
81 |
82 | # LLM configuration
83 | llm_config:
84 |
85 |   # maximum number of tokens passed to the LLM to generate the answer
86 |   max_input_tokens: 8000
87 |
88 |   # temperature for the LLM
89 |   temperature: 0.7
90 | ```
91 |
92 | 3. Create a Brain with the default configuration
93 | ```python
94 | from quivr_core import Brain
95 |
96 | brain = Brain.from_files(name = "my smart brain",
97 |                          file_paths = ["./my_first_doc.pdf", "./my_second_doc.txt"],
98 |                          )
99 |
100 | ```
101 |
102 | 4. Launch a Chat
103 | ```python
104 | brain.print_info()
105 |
106 | from rich.console import Console
107 | from rich.panel import Panel
108 | from rich.prompt import Prompt
109 | from quivr_core.config import RetrievalConfig
110 |
111 | config_file_name = "./rag_with_web_search_workflow.yaml"
112 |
113 | retrieval_config = RetrievalConfig.from_yaml(config_file_name)
114 |
115 | console = Console()
116 | console.print(Panel.fit("Ask your brain !", style="bold magenta"))
117 |
118 | while True:
119 |     # Get user input
120 |     question = Prompt.ask("[bold cyan]Question[/bold cyan]")
121 |
122 |     # Check if user wants to exit
123 |     if question.lower() == "exit":
124 |         console.print(Panel("Goodbye!", style="bold yellow"))
125 |         break
126 |
127 |     answer = brain.ask(question, retrieval_config=retrieval_config)
128 |     # Print the answer with typing effect
129 |     console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
130 |
131 |     console.print("-" * console.width)
132 |
133 | brain.print_info()
134 | ```
135 |
136 | 5. You are now all set up to talk with your brain and test different retrieval strategies by simply changing the configuration file!
137 |
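138 | As a quick sanity check, a question that explicitly asks for information beyond your documents should be routed through the `run_tool` node and activate the "web search" tool, while a purely document-bound question follows the plain retrieval path. A minimal sketch, assuming the `brain` and `retrieval_config` created above; the question text is only illustrative:
139 |
140 | ```python
141 | # Phrasing that signals an intent to look beyond the uploaded documents,
142 | # so the agent may activate the "web search" tool listed in available_tools.
143 | answer = brain.ask(
144 |     "Search the web for recent news on this topic and compare it with my documents",
145 |     retrieval_config=retrieval_config,
146 | )
147 | print(answer.answer)
148 | ```
149 |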
--------------------------------------------------------------------------------
/docs/docs/workflows/index.md:
--------------------------------------------------------------------------------
1 | # Workflows
2 |
3 | In this section, you will find examples of workflows that you can use to create your own agentic RAG systems.
4 |
--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Quivr
2 | extra_css:
3 |   - css/style.css
4 |
5 | markdown_extensions:
6 |   - attr_list
7 |   - admonition
8 |   - pymdownx.details
9 |   - pymdownx.superfences
10 |   - md_in_html
11 |   - toc:
12 |       permalink: "#"
13 |
14 | theme:
15 |   custom_dir: overrides
16 |   features:
17 |     - navigation.instant
18 |     - navigation.tabs
19 |     - navigation.indexes
20 |     - navigation.top
21 |     - navigation.footer
22 |     - toc.follow
23 |     - content.code.copy
24 |     - search.suggest
25 |     - search.highlight
26 |   name: material
27 |   palette:
28 |     - media: (prefers-color-scheme)
29 |       toggle:
30 |         icon: material/brightness-auto
31 |         name: Switch to light mode
32 |     - accent: purple
33 |       media: "(prefers-color-scheme: light)"
34 |       primary: white
35 |       scheme: default
36 |       toggle:
37 |         icon: material/brightness-7
38 |         name: Switch to dark mode
39 |     - accent: purple
40 |       media: "(prefers-color-scheme: dark)"
41 |       primary: black
42 |       scheme: slate
43 |       toggle:
44 |         icon: material/brightness-4
45 |         name: Switch to system preference
46 |
47 | plugins:
48 |   - search
49 |   - mkdocstrings:
50 |       default_handler: python
51 |       handlers:
52 |         python:
53 |           docstring_style: google
54 |           options:
55 |             show_source: false
56 |             heading_level: 2
57 |             separate_signature: true
58 |
59 | nav:
60 |   - Home:
61 |       - index.md
62 |       - quickstart.md
63 |   - Brain:
64 |       - brain/index.md
65 |       - brain/brain.md
66 |       - brain/chat.md
67 |   - Storage:
68 |       - storage/index.md
69 |       - storage/base.md
70 |       - storage/local_storage.md
71 |   - Parsers:
72 |       - parsers/index.md
73 |       - parsers/megaparse.md
74 |       - parsers/simple.md
75 |   - Vector Stores:
76 |       - vectorstores/index.md
77 |       - vectorstores/faiss.md
78 |       - vectorstores/pgvector.md
79 |   - Workflows:
80 |       - workflows/index.md
81 |       - Examples:
82 |           - workflows/examples/basic_ingestion.md
83 |           - workflows/examples/basic_rag.md
84 |           - workflows/examples/rag_with_web_search.md
85 |   - Configuration:
86 |       - config/index.md
87 |       - config/config.md
88 |       - config/base_config.md
89 |   - Examples:
90 |       - examples/index.md
91 |       - examples/custom_storage.md
92 |       - examples/chatbot.md
93 |       - examples/chatbot_voice.md
94 |       - examples/chatbot_voice_flask.md
95 |   - Enterprise: https://docs.quivr.app/
96 |
--------------------------------------------------------------------------------
/docs/overrides/empty:
--------------------------------------------------------------------------------
1 | empty
--------------------------------------------------------------------------------
/docs/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "docs"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | authors = [
6 | { name = "Stan Girard", email = "stan@quivr.app" }
7 | ]
8 | dependencies = [
9 | "quivr-core @ file:///${PROJECT_ROOT}/../core",
10 | "mkdocs>=1.6.1",
11 | "mkdocstrings[python]>=0.26.0",
12 | "mkdocs-jupyter>=0.24.8",
13 | "mkdocs-include-dir-to-nav>=1.2.0",
14 | "mkdocs-material>=9.5.34",
15 | "mkdocs-redirects>=1.2.1",
16 | ]
17 | readme = "README.md"
18 | requires-python = ">= 3.8"
19 |
20 | [build-system]
21 | requires = ["hatchling"]
22 | build-backend = "hatchling.build"
23 |
24 | [tool.rye]
25 | managed = true
26 | dev-dependencies = []
27 | virtual = true
28 |
29 | [tool.rye.scripts]
30 | docs = "mkdocs serve"
31 | build_docs = "mkdocs build --strict"
32 |
33 | [tool.basedpyright]
34 | include = ["src/"]
35 | # Ensure that it uses the .venv that we created for this project with the lockfile
36 | venvPath="./"
37 | venv=".venv"
38 | # We really only care about some import issues, so we disable everything and report on missing imports:
39 | typeCheckingMode = "off"
40 | reportMissingImports = true
41 |
42 |
43 | [tool.hatch.metadata]
44 | allow-direct-references = true
45 |
--------------------------------------------------------------------------------
/docs/src/docs/__init__.py:
--------------------------------------------------------------------------------
1 | def hello() -> str:
2 |     return "Hello from docs!"
3 |
--------------------------------------------------------------------------------
/examples/chatbot/.chainlit/config.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | # Whether to enable telemetry (default: true). No personal data is collected.
3 | enable_telemetry = true
4 |
5 |
6 | # List of environment variables to be provided by each user to use the app.
7 | user_env = []
8 |
9 | # Duration (in seconds) during which the session is saved when the connection is lost
10 | session_timeout = 3600
11 |
12 | # Enable third parties caching (e.g LangChain cache)
13 | cache = false
14 |
15 | # Authorized origins
16 | allow_origins = ["*"]
17 |
18 | # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
19 | # follow_symlink = false
20 |
21 | [features]
22 | # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
23 | unsafe_allow_html = false
24 |
25 | # Process and display mathematical expressions. This can clash with "$" characters in messages.
26 | latex = false
27 |
28 | # Automatically tag threads with the current chat profile (if a chat profile is used)
29 | auto_tag_thread = true
30 |
31 | # Authorize users to spontaneously upload files with messages
32 | [features.spontaneous_file_upload]
33 | enabled = true
34 | accept = ["*/*"]
35 | max_files = 20
36 | max_size_mb = 500
37 |
38 | [features.audio]
39 | # Threshold for audio recording
40 | min_decibels = -45
41 | # Delay for the user to start speaking in MS
42 | initial_silence_timeout = 3000
43 | # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
44 | silence_timeout = 1500
45 | # Above this duration (MS), the recording will forcefully stop.
46 | max_duration = 15000
47 | # Duration of the audio chunks in MS
48 | chunk_duration = 1000
49 | # Sample rate of the audio
50 | sample_rate = 44100
51 |
52 | edit_message = true
53 |
54 | [UI]
55 | # Name of the assistant.
56 | name = "Quivr"
57 |
58 | # Description of the assistant. This is used for HTML tags.
59 | description = "Demo of Quivr"
60 |
61 | # Large size content are by default collapsed for a cleaner ui
62 | default_collapse_content = true
63 |
64 | # Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
65 | cot = "full"
66 |
67 | # Link to your github repo. This will add a github button in the UI's header.
68 | github = "https://github.com/quivrhq/quivr"
69 |
70 | # Specify a CSS file that can be used to customize the user interface.
71 | # The CSS file can be served from the public directory or via an external link.
72 | # custom_css = "/public/custom.css"
73 |
74 | # Specify a Javascript file that can be used to customize the user interface.
75 | # The Javascript file can be served from the public directory.
76 | # custom_js = "/public/test.js"
77 |
78 | # Specify a custom font url.
79 | # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
80 |
81 | # Specify a custom meta image url.
82 | # custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
83 |
84 | # Specify a custom build directory for the frontend.
85 | # This can be used to customize the frontend code.
86 | # Be careful: If this is a relative path, it should not start with a slash.
87 | # custom_build = "./public/build"
88 |
89 | [UI.theme]
90 | default = "dark"
91 | font_family = "Tahoma,Verdana,Segoe,sans-serif"
92 |
93 | # Override default MUI light theme. (Check theme.ts)
94 | [UI.theme.light]
95 | background = "#fcfcfc"
96 | paper = "#f8f8f8"
97 |
98 | [UI.theme.light.primary]
99 | main = "#6142d4"
100 | dark = "#6e53cf"
101 | light = "#6e53cf30"
102 | [UI.theme.light.text]
103 | primary = "#1f1f1f"
104 | secondary = "#818080"
105 |
106 | # Override default MUI dark theme. (Check theme.ts)
107 | [UI.theme.dark]
108 | background = "#252525"
109 | paper = "#1f1f1f"
110 |
111 | [UI.theme.dark.primary]
112 | main = "#6142d4"
113 | dark = "#6e53cf"
114 | light = "#6e53cf30"
115 | [UI.theme.dark.text]
116 | primary = "#f4f4f4"
117 | secondary = "#c8c8c8"
118 |
119 | [meta]
120 | generated_by = "1.1.402"
121 |
--------------------------------------------------------------------------------
/examples/chatbot/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # venv
10 | .venv
11 | .files
--------------------------------------------------------------------------------
/examples/chatbot/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/examples/chatbot/README.md:
--------------------------------------------------------------------------------
1 | # Quivr Chatbot Example
2 |
3 | This example demonstrates how to create a simple chatbot using Quivr and Chainlit. The chatbot allows users to upload a text file and then ask questions about its content.
4 |
5 | ## Prerequisites
6 |
7 | - Python 3.11 or higher
8 |
9 | ## Installation
10 |
11 | 1. Clone the repository and navigate to the `examples/chatbot` directory.
12 |
13 | 2. Make sure you have [rye](https://rye.astral.sh/) installed.
14 |
15 | 3. Install the requirements using `rye`:
16 |
17 | ```sh
18 | rye sync
19 | ```
20 | 4. Activate the venv
21 |
22 | ```sh
23 | source .venv/bin/activate
24 | ```
25 |
26 | ## Running the Chatbot
27 |
28 | 1. Define your API key as environment variable. e.g. `export OPENAI_API_KEY=your-key-here`
29 |
30 | 2. Start the Chainlit server:
31 |
32 | ```
33 | chainlit run main.py
34 | ```
35 |
36 | 3. Open your web browser and go to the URL displayed in the terminal (usually `http://localhost:8000`).
37 |
38 | ## Using the Chatbot
39 |
40 | 1. When the chatbot interface loads, you will be prompted to upload a text file.
41 |
42 | 2. Click on the upload area and select a `.txt` file from your computer. The file size should not exceed 20MB.
43 |
44 | 3. After uploading, the chatbot will process the file and inform you when it's ready.
45 |
46 | 4. You can now start asking questions about the content of the uploaded file.
47 |
48 | 5. Type your questions in the chat input and press Enter. The chatbot will respond based on the information in the uploaded file.
49 |
50 | ## How It Works
51 |
52 | The chatbot uses the Quivr library to create a "brain" from the uploaded text file. This brain is then used to answer questions about the file's content. The Chainlit library provides the user interface and handles the chat interactions.
53 |
54 | Enjoy chatting with your documents!
55 |
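56 | In short, `main.py` boils down to three calls; a condensed sketch (see `main.py` for the full Chainlit integration, the file path here is only illustrative):
57 |
58 | ```python
59 | from quivr_core import Brain
60 | from quivr_core.rag.entities.config import RetrievalConfig
61 |
62 | # Index the uploaded file into a brain
63 | brain = Brain.from_files(name="user_brain", file_paths=["./my_notes.txt"])
64 |
65 | # Load the retrieval workflow shipped with this example
66 | retrieval_config = RetrievalConfig.from_yaml("basic_rag_workflow.yaml")
67 |
68 | # Ask a question (main.py streams the same call with ask_streaming)
69 | answer = brain.ask("What is this file about?", retrieval_config=retrieval_config)
70 | print(answer.answer)
71 | ```
72 |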
--------------------------------------------------------------------------------
/examples/chatbot/basic_rag_workflow.yaml:
--------------------------------------------------------------------------------
1 | workflow_config:
2 |   name: "standard RAG"
3 |   nodes:
4 |     - name: "START"
5 |       edges: ["filter_history"]
6 |
7 |     - name: "filter_history"
8 |       edges: ["rewrite"]
9 |
10 |     - name: "rewrite"
11 |       edges: ["retrieve"]
12 |
13 |     - name: "retrieve"
14 |       edges: ["generate_rag"]
15 |
16 |     - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user
17 |       edges: ["END"]
18 |       tools:
19 |         - name: "cited_answer"
20 |
21 | # Maximum number of previous conversation iterations
22 | # to include in the context of the answer
23 | max_history: 10
24 |
25 | # Reranker configuration
26 | # reranker_config:
27 | #   # The reranker supplier to use
28 | #   supplier: "cohere"
29 |
30 | #   # The model to use for the reranker for the given supplier
31 | #   model: "rerank-multilingual-v3.0"
32 |
33 | #   # Number of chunks returned by the reranker
34 | #   top_n: 5
35 |
36 | # Configuration for the LLM
37 | llm_config:
38 |
39 |   # maximum number of tokens the LLM can generate for the answer
40 |   max_output_tokens: 4000
41 |
42 |   # temperature for the LLM
43 |   temperature: 0.7
44 |
--------------------------------------------------------------------------------
/examples/chatbot/chainlit.md:
--------------------------------------------------------------------------------
1 | # Quivr Chatbot Example
2 |
3 | This example demonstrates how to create a simple chatbot using Quivr and Chainlit. The chatbot allows users to upload a text file and then ask questions about its content.
4 |
5 | ## Prerequisites
6 |
7 | - Python 3.8 or higher
8 |
9 | ## Installation
10 |
11 | 1. Clone the repository or navigate to the `examples/chatbot` directory.
12 |
13 | 2. Install the required dependencies:
14 |
15 | ```
16 | pip install -r requirements.txt
17 | ```
18 |
19 | ## Running the Chatbot
20 |
21 | 1. Start the Chainlit server:
22 |
23 | ```
24 | chainlit run main.py
25 | ```
26 |
27 | 2. Open your web browser and go to the URL displayed in the terminal (usually `http://localhost:8000`).
28 |
29 | ## Using the Chatbot
30 |
31 | 1. When the chatbot interface loads, you will be prompted to upload a text file.
32 |
33 | 2. Click on the upload area and select a `.txt` file from your computer. The file size should not exceed 20MB.
34 |
35 | 3. After uploading, the chatbot will process the file and inform you when it's ready.
36 |
37 | 4. You can now start asking questions about the content of the uploaded file.
38 |
39 | 5. Type your questions in the chat input and press Enter. The chatbot will respond based on the information in the uploaded file.
40 |
41 | ## How It Works
42 |
43 | The chatbot uses the Quivr library to create a "brain" from the uploaded text file. This brain is then used to answer questions about the file's content. The Chainlit library provides the user interface and handles the chat interactions.
44 |
45 | Enjoy chatting with your documents!
46 |
--------------------------------------------------------------------------------
/examples/chatbot/main.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 |
3 | import chainlit as cl
4 | from quivr_core import Brain
5 | from quivr_core.rag.entities.config import RetrievalConfig
6 |
7 |
8 | @cl.on_chat_start
9 | async def on_chat_start():
10 |     files = None
11 |
12 |     # Wait for the user to upload a file
13 |     while files is None:
14 |         files = await cl.AskFileMessage(
15 |             content="Please upload a text .txt file to begin!",
16 |             accept=["text/plain"],
17 |             max_size_mb=20,
18 |             timeout=180,
19 |         ).send()
20 |
21 |     file = files[0]
22 |
23 |     msg = cl.Message(content=f"Processing `{file.name}`...")
24 |     await msg.send()
25 |
26 |     with open(file.path, "r", encoding="utf-8") as f:
27 |         text = f.read()
28 |
29 |     with tempfile.NamedTemporaryFile(
30 |         mode="w", suffix=file.name, delete=False
31 |     ) as temp_file:
32 |         temp_file.write(text)
33 |         temp_file.flush()
34 |         temp_file_path = temp_file.name
35 |
36 |     brain = Brain.from_files(name="user_brain", file_paths=[temp_file_path])
37 |
38 |     # Store the file path in the session
39 |     cl.user_session.set("file_path", temp_file_path)
40 |
41 |     # Let the user know that the system is ready
42 |     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
43 |     await msg.update()
44 |
45 |     cl.user_session.set("brain", brain)
46 |
47 |
48 | @cl.on_message
49 | async def main(message: cl.Message):
50 |     brain = cl.user_session.get("brain")  # type: Brain
51 |     path_config = "basic_rag_workflow.yaml"
52 |     retrieval_config = RetrievalConfig.from_yaml(path_config)
53 |
54 |     if brain is None:
55 |         await cl.Message(content="Please upload a file first.").send()
56 |         return
57 |
58 |     # Prepare the message for streaming
59 |     msg = cl.Message(content="", elements=[])
60 |     await msg.send()
61 |
62 |     saved_sources = set()
63 |     saved_sources_complete = []
64 |     elements = []
65 |
66 |     # Use the ask_streaming method for streaming responses
67 |     async for chunk in brain.ask_streaming(message.content, retrieval_config=retrieval_config):
68 |         await msg.stream_token(chunk.answer)
69 |         for source in chunk.metadata.sources:
70 |             if source.page_content not in saved_sources:
71 |                 saved_sources.add(source.page_content)
72 |                 saved_sources_complete.append(source)
73 |                 print(source)
74 |                 elements.append(cl.Text(name=source.metadata["original_file_name"], content=source.page_content, display="side"))
75 |
76 |
77 |     await msg.send()
78 |     sources = ""
79 |     for source in saved_sources_complete:
80 |         sources += f"- {source.metadata['original_file_name']}\n"
81 |     msg.elements = elements
82 |     msg.content = msg.content + f"\n\nSources:\n{sources}"
83 |     await msg.update()
--------------------------------------------------------------------------------
/examples/chatbot/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot/public/favicon.ico
--------------------------------------------------------------------------------
/examples/chatbot/public/logo_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot/public/logo_dark.png
--------------------------------------------------------------------------------
/examples/chatbot/public/logo_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot/public/logo_light.png
--------------------------------------------------------------------------------
/examples/chatbot/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "chatbot"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | authors = [
6 | { name = "Stan Girard", email = "stan@quivr.app" }
7 | ]
8 | dependencies = [
9 | "quivr-core @ file:///${PROJECT_ROOT}/../../core",
10 | "chainlit>=1.2.0",
11 | ]
12 | readme = "README.md"
13 | requires-python = ">= 3.11"
14 |
15 | [tool.rye]
16 | managed = true
17 | virtual = true
18 | dev-dependencies = []
19 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/.chainlit/config.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | # Whether to enable telemetry (default: true). No personal data is collected.
3 | enable_telemetry = true
4 |
5 |
6 | # List of environment variables to be provided by each user to use the app.
7 | user_env = []
8 |
9 | # Duration (in seconds) during which the session is saved when the connection is lost
10 | session_timeout = 3600
11 |
12 | # Enable third parties caching (e.g LangChain cache)
13 | cache = false
14 |
15 | # Authorized origins
16 | allow_origins = ["*"]
17 |
18 | # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
19 | # follow_symlink = false
20 |
21 | [features]
22 | # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
23 | unsafe_allow_html = false
24 |
25 | # Process and display mathematical expressions. This can clash with "$" characters in messages.
26 | latex = false
27 |
28 | # Automatically tag threads with the current chat profile (if a chat profile is used)
29 | auto_tag_thread = true
30 |
31 | # Authorize users to spontaneously upload files with messages
32 | [features.spontaneous_file_upload]
33 | enabled = false
34 | accept = ["*/*"]
35 | max_files = 20
36 | max_size_mb = 500
37 |
38 | [features.audio]
39 | # Threshold for audio recording
40 | min_decibels = -45
41 | # Delay for the user to start speaking in MS
42 | initial_silence_timeout = 3000
43 | # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
44 | silence_timeout = 1500
45 | # Above this duration (MS), the recording will forcefully stop.
46 | max_duration = 15000
47 | # Duration of the audio chunks in MS
48 | chunk_duration = 1000
49 | # Sample rate of the audio
50 | sample_rate = 44100
51 |
52 | edit_message = true
53 |
54 | [UI]
55 | # Name of the assistant.
56 | name = "Quivr"
57 |
58 | # Description of the assistant. This is used for HTML tags.
59 | description = "Demo of Quivr"
60 |
61 | # Large size content are by default collapsed for a cleaner ui
62 | default_collapse_content = true
63 |
64 | # Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
65 | cot = "hidden"
66 |
67 | # Link to your github repo. This will add a github button in the UI's header.
68 | github = "https://github.com/quivrhq/quivr"
69 |
70 | # Specify a CSS file that can be used to customize the user interface.
71 | # The CSS file can be served from the public directory or via an external link.
72 | # custom_css = "/public/style.css"
73 |
74 | # Specify a Javascript file that can be used to customize the user interface.
75 | # The Javascript file can be served from the public directory.
76 | # custom_js = "/public/test.js"
77 |
78 | # Specify a custom font url.
79 | # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
80 |
81 | # Specify a custom meta image url.
82 | # custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
83 |
84 | # Specify a custom build directory for the frontend.
85 | # This can be used to customize the frontend code.
86 | # Be careful: If this is a relative path, it should not start with a slash.
87 | # custom_build = "./public/build"
88 |
89 | [UI.theme]
90 | default = "dark"
91 | font_family = "Tahoma,Verdana,Segoe,sans-serif"
92 |
93 | # Override default MUI light theme. (Check theme.ts)
94 | [UI.theme.light]
95 | background = "#fcfcfc"
96 | paper = "#f8f8f8"
97 |
98 | [UI.theme.light.primary]
99 | main = "#6142d4"
100 | dark = "#6e53cf"
101 | light = "#6e53cf30"
102 | [UI.theme.light.text]
103 | primary = "#1f1f1f"
104 | secondary = "#818080"
105 |
106 | # Override default MUI dark theme. (Check theme.ts)
107 | [UI.theme.dark]
108 | background = "#252525"
109 | paper = "#1f1f1f"
110 |
111 | [UI.theme.dark.primary]
112 | main = "#6142d4"
113 | dark = "#6e53cf"
114 | light = "#6e53cf30"
115 | [UI.theme.dark.text]
116 | primary = "#f4f4f4"
117 | secondary = "#c8c8c8"
118 |
119 | [meta]
120 | generated_by = "1.1.402"
121 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # venv
10 | .venv
11 | .files
--------------------------------------------------------------------------------
/examples/chatbot_voice/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/README.md:
--------------------------------------------------------------------------------
1 | # Quivr Chatbot Example
2 |
3 | This example demonstrates how to create a simple chatbot using Quivr and Chainlit. The chatbot allows users to upload a text file and then ask questions about its content.
4 |
5 | ## Prerequisites
6 |
7 | - Python 3.11 or higher
8 |
9 | ## Installation
10 |
11 | 1. Clone the repository or navigate to the `examples/chatbot_voice` directory.
12 |
13 | 2. Install the required dependencies:
14 |
15 | ```
16 | pip install -r requirements.lock
17 | ```
18 |
19 | ## Running the Chatbot
20 |
21 | 1. Start the Chainlit server:
22 |
23 | ```
24 | chainlit run main.py
25 | ```
26 |
27 | 2. Open your web browser and go to the URL displayed in the terminal (usually `http://localhost:8000`).
28 |
29 | ## Using the Chatbot
30 |
31 | 1. When the chatbot interface loads, you will be prompted to upload a text file.
32 |
33 | 2. Click on the upload area and select a `.txt` file from your computer. The file size should not exceed 20MB.
34 |
35 | 3. After uploading, the chatbot will process the file and inform you when it's ready.
36 |
37 | 4. You can now start asking questions about the content of the uploaded file.
38 |
39 | 5. Type your questions in the chat input and press Enter. The chatbot will respond based on the information in the uploaded file.
40 |
41 | ## How It Works
42 |
43 | The chatbot uses the Quivr library to create a "brain" from the uploaded text file. This brain is then used to answer questions about the file's content. The Chainlit library provides the user interface and handles the chat interactions.
44 |
45 | Enjoy chatting with your documents!
46 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/basic_rag_workflow.yaml:
--------------------------------------------------------------------------------
1 | workflow_config:
2 |   name: "standard RAG"
3 |   nodes:
4 |     - name: "START"
5 |       edges: ["filter_history"]
6 |
7 |     - name: "filter_history"
8 |       edges: ["rewrite"]
9 |
10 |     - name: "rewrite"
11 |       edges: ["retrieve"]
12 |
13 |     - name: "retrieve"
14 |       edges: ["generate_rag"]
15 |
16 |     - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user
17 |       edges: ["END"]
18 |
19 | # Maximum number of previous conversation iterations
20 | # to include in the context of the answer
21 | max_history: 10
22 |
23 | # Reranker configuration
24 | # reranker_config:
25 | #   # The reranker supplier to use
26 | #   supplier: "cohere"
27 |
28 | #   # The model to use for the reranker for the given supplier
29 | #   model: "rerank-multilingual-v3.0"
30 |
31 | #   # Number of chunks returned by the reranker
32 | #   top_n: 5
33 |
34 | # Configuration for the LLM
35 | llm_config:
36 |
37 |   # maximum number of tokens the LLM can generate for the answer
38 |   max_output_tokens: 4000
39 |
40 |   # temperature for the LLM
41 |   temperature: 0.7
42 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/chainlit.md:
--------------------------------------------------------------------------------
1 | # Quivr Chatbot Example
2 |
3 | This example demonstrates how to create a simple chatbot using Quivr and Chainlit. The chatbot allows users to upload a text file and then ask questions about its content.
4 |
5 | ## Prerequisites
6 |
7 | - Python 3.8 or higher
8 |
9 | ## Installation
10 |
11 | 1. Clone the repository or navigate to the `examples/chatbot_voice` directory.
12 |
13 | 2. Install the required dependencies:
14 |
15 | ```
16 | pip install -r requirements.txt
17 | ```
18 |
19 | ## Running the Chatbot
20 |
21 | 1. Start the Chainlit server:
22 |
23 | ```
24 | chainlit run main.py
25 | ```
26 |
27 | 2. Open your web browser and go to the URL displayed in the terminal (usually `http://localhost:8000`).
28 |
29 | ## Using the Chatbot
30 |
31 | 1. When the chatbot interface loads, you will be prompted to upload a text file.
32 |
33 | 2. Click on the upload area and select a `.txt` file from your computer. The file size should not exceed 20MB.
34 |
35 | 3. After uploading, the chatbot will process the file and inform you when it's ready.
36 |
37 | 4. You can now start asking questions about the content of the uploaded file.
38 |
39 | 5. Type your questions in the chat input and press Enter. The chatbot will respond based on the information in the uploaded file.
40 |
41 | ## How It Works
42 |
43 | The chatbot uses the Quivr library to create a "brain" from the uploaded text file. This brain is then used to answer questions about the file's content. The Chainlit library provides the user interface and handles the chat interactions.
44 |
45 | Enjoy chatting with your documents!
46 |
--------------------------------------------------------------------------------
/examples/chatbot_voice/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot_voice/public/favicon.ico
--------------------------------------------------------------------------------
/examples/chatbot_voice/public/logo_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot_voice/public/logo_dark.png
--------------------------------------------------------------------------------
/examples/chatbot_voice/public/logo_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/examples/chatbot_voice/public/logo_light.png
--------------------------------------------------------------------------------
/examples/chatbot_voice/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "chatbot"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | authors = [
6 | { name = "Stan Girard", email = "stan@quivr.app" }
7 | ]
8 | dependencies = [
9 | "quivr-core @ file:///${PROJECT_ROOT}/../../core",
10 | "chainlit>=1.2.0",
11 | "openai>=1.54.5",
12 | ]
13 | readme = "README.md"
14 | requires-python = ">= 3.11"
15 |
16 | [tool.rye]
17 | managed = true
18 | virtual = true
19 | dev-dependencies = []
20 |
--------------------------------------------------------------------------------
/examples/pdf_parsing_tika.py:
--------------------------------------------------------------------------------
1 | from langchain_core.embeddings import DeterministicFakeEmbedding
2 | from langchain_core.language_models import FakeListChatModel
3 | from quivr_core import Brain
4 | from quivr_core.rag.entities.config import LLMEndpointConfig
5 | from quivr_core.llm.llm_endpoint import LLMEndpoint
6 | from rich.console import Console
7 | from rich.panel import Panel
8 | from rich.prompt import Prompt
9 |
10 | if __name__ == "__main__":
11 |     brain = Brain.from_files(
12 |         name="test_brain",
13 |         file_paths=["tests/processor/data/dummy.pdf"],
14 |         llm=LLMEndpoint(
15 |             llm=FakeListChatModel(responses=["good"]),
16 |             llm_config=LLMEndpointConfig(model="fake_model", llm_base_url="local"),
17 |         ),
18 |         embedder=DeterministicFakeEmbedding(size=20),
19 |     )
20 |     # Check brain info
21 |     brain.print_info()
22 |
23 |     console = Console()
24 |     console.print(Panel.fit("Ask your brain !", style="bold magenta"))
25 |
26 |     while True:
27 |         # Get user input
28 |         question = Prompt.ask("[bold cyan]Question[/bold cyan]")
29 |
30 |         # Check if user wants to exit
31 |         if question.lower() == "exit":
32 |             console.print(Panel("Goodbye!", style="bold yellow"))
33 |             break
34 |
35 |         answer = brain.ask(question)
36 |         # Print the answer with typing effect
37 |         console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
38 |
39 |         console.print("-" * console.width)
40 |
41 |     brain.print_info()
42 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/.env_example:
--------------------------------------------------------------------------------
1 | QUIVR_API_KEY=XXXX
2 | QUIVR_CHAT_ID=1XXXX
3 | QUIVR_BRAIN_ID=XXXX
4 | QUIVR_URL=XXXX
5 | OPENAI_API_KEY=XXXX
6 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | uploads
--------------------------------------------------------------------------------
/examples/quivr-whisper/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/README.md:
--------------------------------------------------------------------------------
1 | # Quivr-Whisper
2 |
3 | Quivr-Whisper is a web application that allows users to ask questions via audio input. It leverages OpenAI's Whisper model for speech transcription and synthesizes responses using OpenAI's text-to-speech capabilities. The application queries the Quivr API to get a response based on the transcribed audio input.
4 |
5 |
6 |
7 | https://github.com/StanGirard/quivr-whisper/assets/19614572/9cc270c9-07e4-4ce1-bcff-380f195c9313
8 |
9 |
10 |
11 | ## Features
12 |
13 | - Audio input for asking questions
14 | - Speech transcription using OpenAI's Whisper model
15 | - Integration with Quivr API for intelligent responses
16 | - Speech synthesis of the response for audio playback
17 |
18 | ## Getting Started
19 |
20 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
21 |
22 | ### Prerequisites
23 |
24 | What things you need to install the software and how to install them:
25 |
26 | - Python 3.11+
27 | - pip for Python 3
28 | - Flask
29 | - OpenAI Python package
30 | - Requests package
31 |
32 | ### Installing
33 |
34 | A step by step series of examples that tell you how to get a development environment running:
35 |
36 | 1. Clone the repository to your local machine.
37 | ```bash
38 | git clone https://github.com/stangirard/quivr-whisper.git
39 | cd quivr-whisper
40 | ```
41 |
42 | 2. Install the required packages.
43 | ```bash
44 | pip install flask openai requests python-dotenv
45 | ```
46 |
47 | 3. Create a `.env` file in the root directory of the project and add your API keys and other configuration variables.
48 | ```env
49 | OPENAI_API_KEY='your_openai_api_key'
50 | QUIVR_API_KEY='your_quivr_api_key'
51 | QUIVR_CHAT_ID='your_quivr_chat_id'
52 | QUIVR_BRAIN_ID='your_quivr_brain_id'
53 | QUIVR_URL='https://api.quivr.app' # Optional, only if different from the default
54 | ```
55 |
56 | 4. Run the Flask application.
57 | ```bash
58 | flask run
59 | ```
60 |
61 | Your app should now be running on `http://localhost:5000`.
62 |
63 | ## Usage
64 |
65 | To use Quivr-Whisper, navigate to `http://localhost:5000` in your web browser, click on "Ask a question to Quivr", and record your question. Wait for the transcription and response to be synthesized, and you will hear the response played back to you.
66 |
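67 | Under the hood, `app.py` chains three steps for every recorded question; a condensed sketch (see `app.py` for the Flask routes and session handling; the file names below are only illustrative and OPENAI_API_KEY is assumed to be set in the environment):
68 |
69 | ```python
70 | import openai
71 | from quivr_core import Brain
72 |
73 | brain = Brain.from_files(name="user_brain", file_paths=["./my_notes.txt"])
74 |
75 | # 1. Speech to text with Whisper
76 | with open("question.webm", "rb") as f:
77 |     transcript = openai.audio.transcriptions.create(model="whisper-1", file=f).text
78 |
79 | # 2. Ask the brain
80 | answer = brain.ask(transcript)
81 |
82 | # 3. Text to speech, played back in the browser
83 | speech = openai.audio.speech.create(model="tts-1", voice="nova", input=answer.answer)
84 | ```
85 |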
--------------------------------------------------------------------------------
/examples/quivr-whisper/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request, jsonify, session
2 | import openai
3 | import base64
4 | import os
5 | import requests
6 | from dotenv import load_dotenv
7 | from quivr_core import Brain
8 | from quivr_core.rag.entities.config import RetrievalConfig
9 | from tempfile import NamedTemporaryFile
10 | from werkzeug.utils import secure_filename
11 | from asyncio import to_thread
12 | import asyncio
13 |
14 |
15 | UPLOAD_FOLDER = "uploads"
16 | ALLOWED_EXTENSIONS = {"txt"}
17 |
18 | os.makedirs(UPLOAD_FOLDER, exist_ok=True)
19 |
20 | app = Flask(__name__)
21 | app.secret_key = "secret"
22 | app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER
23 | app.config["CACHE_TYPE"] = "SimpleCache" # In-memory cache for development
24 | app.config["CACHE_DEFAULT_TIMEOUT"] = 60 * 60 # 1 hour cache timeout
25 | load_dotenv()
26 |
27 | openai.api_key = os.getenv("OPENAI_API_KEY")
28 |
29 | brains = {}
30 |
31 |
32 | @app.route("/")
33 | def index():
34 |     return render_template("index.html")
35 |
36 |
37 | def run_in_event_loop(func, *args, **kwargs):
38 |     loop = asyncio.new_event_loop()
39 |     asyncio.set_event_loop(loop)
40 |     if asyncio.iscoroutinefunction(func):
41 |         result = loop.run_until_complete(func(*args, **kwargs))
42 |     else:
43 |         result = func(*args, **kwargs)
44 |     loop.close()
45 |     return result
46 |
47 |
48 | def allowed_file(filename):
49 |     return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
50 |
51 |
52 | @app.route("/upload", methods=["POST"])
53 | async def upload_file():
54 |     if "file" not in request.files:
55 |         return "No file part", 400
56 |
57 |     file = request.files["file"]
58 |
59 |     if file.filename == "":
60 |         return "No selected file", 400
61 |     if not (file and file.filename and allowed_file(file.filename)):
62 |         return "Invalid file type", 400
63 |
64 |     filename = secure_filename(file.filename)
65 |     filepath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
66 |     file.save(filepath)
67 |
68 |     print(f"File uploaded and saved at: {filepath}")
69 |
70 |     print("Creating brain instance...")
71 |
72 |     brain: Brain = await to_thread(
73 |         run_in_event_loop, Brain.from_files, name="user_brain", file_paths=[filepath]
74 |     )
75 |
76 |     # Store brain instance in cache
77 |     session_id = session.sid if hasattr(session, "sid") else os.urandom(16).hex()
78 |     session["session_id"] = session_id
79 |     # cache.set(session_id, brain) # Store the brain instance in the cache
80 |     brains[session_id] = brain
81 |     print(f"Brain instance created and stored in cache for session ID: {session_id}")
82 |
83 |     return jsonify({"message": "Brain created successfully"})
84 |
85 |
86 | @app.route("/ask", methods=["POST"])
87 | async def ask():
88 |     if "audio_data" not in request.files:
89 |         return "Missing audio data", 400
90 |
91 |     # Retrieve the brain instance from the cache using the session ID
92 |     session_id = session.get("session_id")
93 |     if not session_id:
94 |         return "Session ID not found. Upload a file first.", 400
95 |
96 |     brain = brains.get(session_id)
97 |     if not brain:
98 |         return "Brain instance not found in dict. Upload a file first.", 400
99 |
100 |     print("Brain instance loaded from cache.")
101 |
102 |     print("Speech to text...")
103 |     audio_file = request.files["audio_data"]
104 |     transcript = transcribe_audio_file(audio_file)
105 |     print("Transcript result: ", transcript)
106 |
107 |     print("Getting response...")
108 |     quivr_response = await to_thread(run_in_event_loop, brain.ask, transcript)
109 |
110 |     print("Text to speech...")
111 |     audio_base64 = synthesize_speech(quivr_response.answer)
112 |
113 |     print("Done")
114 |     return jsonify({"audio_base64": audio_base64})
115 |
116 |
117 | def transcribe_audio_file(audio_file):
118 |     with NamedTemporaryFile(suffix=".webm", delete=False) as temp_audio_file:
119 |         audio_file.save(temp_audio_file)
120 |         temp_audio_file_path = temp_audio_file.name
121 |
122 |     try:
123 |         with open(temp_audio_file_path, "rb") as f:
124 |             transcript_response = openai.audio.transcriptions.create(
125 |                 model="whisper-1", file=f
126 |             )
127 |         transcript = transcript_response.text
128 |     finally:
129 |         os.unlink(temp_audio_file_path)
130 |
131 |     return transcript
132 |
133 |
134 | def synthesize_speech(text):
135 |     speech_response = openai.audio.speech.create(
136 |         model="tts-1", voice="nova", input=text
137 |     )
138 |     audio_content = speech_response.content
139 |     audio_base64 = base64.b64encode(audio_content).decode("utf-8")
140 |     return audio_base64
141 |
142 |
143 | if __name__ == "__main__":
144 |     app.run(debug=True)
145 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quivr-whisper"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | authors = [
6 | { name = "Stan Girard", email = "stan@quivr.app" }
7 | ]
8 | dependencies = [
9 | "quivr-core @ file:///${PROJECT_ROOT}/../../core",
10 | "flask[async]>=3.1.0",
11 | "openai>=1.54.5",
12 | "flask-caching>=2.3.0",
13 | ]
14 | readme = "README.md"
15 | requires-python = ">= 3.11"
16 |
17 | [build-system]
18 | requires = ["hatchling"]
19 | build-backend = "hatchling.build"
20 |
21 | [tool.rye]
22 | managed = true
23 | virtual = true
24 | dev-dependencies = []
25 |
26 | [tool.hatch.metadata]
27 | allow-direct-references = true
28 |
29 | [tool.hatch.build.targets.wheel]
30 | packages = ["src/quivr_whisper"]
31 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/static/loader.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/static/mic-off.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/static/mic.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/quivr-whisper/static/styles.css:
--------------------------------------------------------------------------------
1 | * {
2 | box-sizing: border-box;
3 | margin: 0;
4 | padding: 0;
5 | }
6 |
7 |
8 | body {
9 | color: #f4f4f4;
10 | background-color: #252525;
11 | display: flex;
12 | gap: 1rem;
13 | align-items: center;
14 | flex-direction: column;
15 | justify-content: center;
16 | min-height: 100vh;
17 | }
18 |
19 | .primary {
20 | background-color: #6142d4;
21 | }
22 |
23 | button {
24 | background-color: #6142d4;
25 | border: none;
26 | padding: .75rem 2rem;
27 | border-radius: 0.5rem;
28 | color: #f4f4f4;
29 | cursor: pointer;
30 | }
31 |
32 | canvas {
33 | position: absolute;
34 | width: 100%;
35 | height: 100%;
36 | top: 0;
37 | left: 0;
38 | background-color: #252525;
39 | z-index: -1;
40 | }
41 |
42 | .record-btn {
43 | background-color: #f5f5f5;
44 | border: none;
45 | outline: none;
46 | width: 256px;
47 | height: 256px;
48 | background-repeat: no-repeat;
49 | background-position: center;
50 | border-radius: 50%;
51 | background-size: 50%;
52 | transition: background-color 200ms ease-in, transform 200ms ease-out;
53 | }
54 |
55 | .record-btn:hover {
56 | background-color: #fff;
57 | transform: scale(1.025);
58 | }
59 |
60 | .record-btn:active {
61 | background-color: #e2e2e2;
62 | transform: scale(0.975);
63 | }
64 |
65 | .record-btn[data-recording="true"] {
66 | background-image: url("./mic.svg");
67 | }
68 |
69 | .record-btn[data-recording="false"] {
70 | background-image: url("./mic-off.svg");
71 | }
72 |
73 | .record-btn[data-pending="true"] {
74 | background-image: url("./loader.svg") !important;
75 | animation: spin 1s linear infinite;
76 | }
77 |
78 | .hidden {
79 | display: none !important;
80 | visibility: hidden;
81 | }
82 |
83 | .custom-file-input {
84 | display: flex;
85 | flex-direction: column;
86 | align-items: center;
87 | gap: 10px;
88 | }
89 |
90 | .custom-file-input input[type="file"] {
91 | display: none;
92 | }
93 |
94 | .custom-file-input label {
95 | border: solid 2px #6142d4;
96 | color: white;
97 | padding: 8px 16px;
98 | border-radius: 4px;
99 | cursor: pointer;
100 | font-size: 14px;
101 | font-weight: bold;
102 | transition: background-color 0.3s;
103 | }
104 |
105 | .custom-file-input label:hover {
106 | background-color: #6142d4;
107 | }
108 |
109 | .custom-file-input span {
110 | font-size: 14px;
111 | color: #f4f4f4;
112 | }
113 |
114 | /* Adjust appearance when a file is selected */
115 | .custom-file-input span.file-selected {
116 | color: #ffffff;
117 | font-weight: bold;
118 | }
119 |
120 | /*
121 | # Override default MUI light theme. (Check theme.ts)
122 | [UI.theme.light]
123 | background = "#fcfcfc"
124 | paper = "#f8f8f8"
125 |
126 | [UI.theme.light.primary]
127 | main = "#6142d4"
128 | dark = "#6e53cf"
129 | light = "#6e53cf30"
130 | [UI.theme.light.text]
131 | primary = "#1f1f1f"
132 | secondary = "#818080"
133 |
134 | # Override default MUI dark theme. (Check theme.ts)
135 | [UI.theme.dark]
136 | background = "#252525"
137 | paper = "#1f1f1f"
138 |
139 | [UI.theme.dark.primary]
140 | main = "#6142d4"
141 | dark = "#6e53cf"
142 | light = "#6e53cf30"
143 | [UI.theme.dark.text]
144 | primary = "#f4f4f4"
145 | secondary = "#c8c8c8"
146 |
147 | */
148 |
149 | .loader {
150 | border: 4px solid #f3f3f3;
151 | border-radius: 50%;
152 | border-top: 4px solid #3498db;
153 | width: 50px;
154 | height: 50px;
155 | -webkit-animation: spin 2s linear infinite;
156 | animation: spin 2s linear infinite;
157 | position: absolute;
158 | /* Center the loader in the viewport */
159 | top: 50%;
160 | left: 50%;
161 | transform: translate(-50%, -50%);
162 | display: none;
163 | /* Hide it by default */
164 | }
165 |
166 | @-webkit-keyframes spin {
167 | 0% {
168 | -webkit-transform: rotate(0deg);
169 | }
170 |
171 | 100% {
172 | -webkit-transform: rotate(360deg);
173 | }
174 | }
175 |
176 | @keyframes spin {
177 | 0% {
178 | transform: rotate(0deg);
179 | }
180 |
181 | 100% {
182 | transform: rotate(360deg);
183 | }
184 | }
--------------------------------------------------------------------------------
/examples/quivr-whisper/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Audio Interaction WebApp
7 |
11 |
12 |
13 |
14 |
21 |
22 |
23 |
30 | No file chosen
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/examples/save_load_brain.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import tempfile
3 |
4 | from quivr_core import Brain
5 |
6 |
7 | async def main():
8 |     with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file:
9 |         temp_file.write("Gold is a liquid of blue-like colour.")
10 |         temp_file.flush()
11 |
12 |         brain = await Brain.afrom_files(name="test_brain", file_paths=[temp_file.name])
13 |
14 |         save_path = await brain.save("/home/amine/.local/quivr")
15 |
16 |         brain_loaded = Brain.load(save_path)
17 |         brain_loaded.print_info()
18 |
19 |
20 | if __name__ == "__main__":
21 |     # Run the main function in the existing event loop
22 |     asyncio.run(main())
23 |
--------------------------------------------------------------------------------
/examples/simple_question/.gitignore:
--------------------------------------------------------------------------------
1 | # python generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # venv
10 | .venv
11 |
--------------------------------------------------------------------------------
/examples/simple_question/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.9
2 |
--------------------------------------------------------------------------------
/examples/simple_question/README.md:
--------------------------------------------------------------------------------
1 | # simple-question
2 |
3 | Describe your project here.
4 |
--------------------------------------------------------------------------------
/examples/simple_question/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "simple-question"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | authors = [
6 | { name = "Stan Girard", email = "stan@quivr.app" }
7 | ]
8 | dependencies = [
9 | "quivr-core @ file:///${PROJECT_ROOT}/../../core",
10 | "python-dotenv>=1.0.1",
11 | ]
12 | readme = "README.md"
13 | requires-python = ">= 3.11"
14 |
15 | [tool.rye]
16 | managed = true
17 | virtual = true
18 | dev-dependencies = []
19 |
--------------------------------------------------------------------------------
/examples/simple_question/simple_question.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 |
3 | from quivr_core import Brain
4 |
5 | import dotenv
6 |
7 | dotenv.load_dotenv()
8 |
9 | if __name__ == "__main__":
10 |     with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file:
11 |         temp_file.write("Gold is a liquid of blue-like colour.")
12 |         temp_file.flush()
13 |
14 |         brain = Brain.from_files(
15 |             name="test_brain",
16 |             file_paths=[temp_file.name],
17 |         )
18 |
19 |         answer = brain.ask("what is gold? answer in french")
20 |         print("answer QuivrQARAGLangGraph :", answer)
21 |
--------------------------------------------------------------------------------
/examples/simple_question/simple_question_streaming.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import tempfile
3 |
4 | from dotenv import load_dotenv
5 | from quivr_core import Brain
6 | from quivr_core.rag.quivr_rag import QuivrQARAG
7 | from quivr_core.rag.quivr_rag_langgraph import QuivrQARAGLangGraph
8 |
9 |
10 | async def main():
11 |     dotenv_path = "/Users/jchevall/Coding/QuivrHQ/quivr/.env"
12 |     load_dotenv(dotenv_path)
13 |
14 |     with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as temp_file:
15 |         temp_file.write("Gold is a liquid of blue-like colour.")
16 |         temp_file.flush()
17 |
18 |         brain = await Brain.afrom_files(name="test_brain", file_paths=[temp_file.name])
19 |
20 |         await brain.save("~/.local/quivr")
21 |
22 |         question = "what is gold? answer in french"
23 |         async for chunk in brain.ask_streaming(question, rag_pipeline=QuivrQARAG):
24 |             print("answer QuivrQARAG:", chunk.answer)
25 |
26 |         async for chunk in brain.ask_streaming(
27 |             question, rag_pipeline=QuivrQARAGLangGraph
28 |         ):
29 |             print("answer QuivrQARAGLangGraph:", chunk.answer)
30 |
31 |
32 | if __name__ == "__main__":
33 |     # Run the main function in the existing event loop
34 |     asyncio.run(main())
35 |
--------------------------------------------------------------------------------
/examples/simple_question_megaparse.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings
4 | from quivr_core import Brain
5 | from quivr_core.llm.llm_endpoint import LLMEndpoint
6 | from quivr_core.rag.entities.config import LLMEndpointConfig
7 | from rich.console import Console
8 | from rich.panel import Panel
9 | from rich.prompt import Prompt
10 |
11 | if __name__ == "__main__":
12 |     brain = Brain.from_files(
13 |         name="test_brain",
14 |         file_paths=["./tests/processor/pdf/sample.pdf"],
15 |         llm=LLMEndpoint(
16 |             llm_config=LLMEndpointConfig(model="gpt-4o"),
17 |             llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),
18 |         ),
19 |     )
20 |     embedder = embeddings = OpenAIEmbeddings(
21 |         model="text-embedding-3-large",
22 |     )
23 |     # Check brain info
24 |     brain.print_info()
25 |
26 |     console = Console()
27 |     console.print(Panel.fit("Ask your brain !", style="bold magenta"))
28 |
29 |     while True:
30 |         # Get user input
31 |         question = Prompt.ask("[bold cyan]Question[/bold cyan]")
32 |
33 |         # Check if user wants to exit
34 |         if question.lower() == "exit":
35 |             console.print(Panel("Goodbye!", style="bold yellow"))
36 |             break
37 |
38 |         answer = brain.ask(question)
39 |         # Print the answer with typing effect
40 |         console.print(f"[bold green]Quivr Assistant[/bold green]: {answer.answer}")
41 |
42 |         console.print("-" * console.width)
43 |
44 |     brain.print_info()
45 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuivrHQ/quivr/5dd44d8eb37b25e9d7c14c14df47f19849cdd031/logo.png
--------------------------------------------------------------------------------
/release-please-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "packages": {
3 | "core": {
4 | "release-type": "python",
5 | "package-name": "core",
6 | "bump-patch-for-minor-pre-major": true,
7 | "include-v-in-tag": false,
8 | "tag-separator": "-",
9 | "component": "core"
10 | }
11 | }
12 | }
--------------------------------------------------------------------------------
/vercel.json:
--------------------------------------------------------------------------------
1 | {
2 | "git": {
3 | "deploymentEnabled": {
4 | "main": false
5 | }
6 | }
7 | }
--------------------------------------------------------------------------------