├── .devcontainer └── devcontainer.json ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ └── feature-request.yml ├── actions │ └── install-deps-and-canopy │ │ └── action.yml └── workflows │ ├── PR.yml │ ├── build-push-image.yml │ ├── pre-release-CI.yml │ └── release.yml ├── .gitignore ├── .readme-content ├── class_architecture.png ├── new.gif ├── rag_flow.png ├── resin-chat-no-rag.gif └── sketch.png ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docs ├── deployment-gcp.md └── library.md ├── examples └── canopy-lib-quickstart.ipynb ├── pyproject.toml ├── scripts └── cleanup_indexes.py ├── src ├── canopy │ ├── __init__.py │ ├── chat_engine │ │ ├── __init__.py │ │ ├── chat_engine.py │ │ ├── exceptions.py │ │ ├── history_pruner │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── raising.py │ │ │ └── recent.py │ │ └── query_generator │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── function_calling.py │ │ │ ├── instruction.py │ │ │ └── last_message.py │ ├── config_templates │ │ ├── anyscale.yaml │ │ ├── azure.yaml │ │ ├── cohere.yaml │ │ ├── default.yaml │ │ └── octoai.yaml │ ├── context_engine │ │ ├── __init__.py │ │ ├── context_builder │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── stuffing.py │ │ └── context_engine.py │ ├── knowledge_base │ │ ├── __init__.py │ │ ├── base.py │ │ ├── chunker │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── langchain_text_splitter.py │ │ │ ├── markdown.py │ │ │ ├── recursive_character.py │ │ │ └── token_chunker.py │ │ ├── knowledge_base.py │ │ ├── models.py │ │ ├── qdrant │ │ │ ├── constants.py │ │ │ ├── converter.py │ │ │ ├── qdrant_knowledge_base.py │ │ │ └── utils.py │ │ ├── record_encoder │ │ │ ├── __init__.py │ │ │ ├── anyscale.py │ │ │ ├── azure_openai.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── dense.py │ │ │ ├── hybrid.py │ │ │ ├── jina.py │ │ │ ├── octoai.py │ │ │ ├── openai.py │ │ │ └── sentence_transformers.py │ │ └── reranker │ │ │ ├── __init__.py │ │ │ ├── cohere.py │ │ │ ├── reranker.py │ │ │ └── transparent.py │ ├── llm │ │ ├── __init__.py │ │ ├── anyscale.py │ │ ├── azure_openai_llm.py │ │ ├── base.py │ │ ├── cohere.py │ │ ├── models.py │ │ ├── octoai.py │ │ └── openai.py │ ├── models │ │ ├── __init__.py │ │ ├── api_models.py │ │ └── data_models.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cohere.py │ │ ├── llama.py │ │ ├── openai.py │ │ └── tokenizer.py │ └── utils │ │ ├── __init__.py │ │ ├── config.py │ │ ├── debugging.py │ │ └── directory.py ├── canopy_cli │ ├── __init__.py │ ├── cli.py │ ├── cli_spinner.py │ ├── data_loader │ │ ├── __init__.py │ │ ├── data_loader.py │ │ └── errors.py │ └── errors.py └── canopy_server │ ├── __init__.py │ ├── _redocs_template.py │ ├── app.py │ └── models │ └── v1 │ └── api_models.py └── tests ├── __init__.py ├── conftest.py ├── e2e ├── __init__.py └── test_app.py ├── system ├── __init__.py ├── knowledge_base │ ├── __init__.py │ ├── qdrant │ │ ├── __init__.py │ │ ├── common.py │ │ ├── conftest.py │ │ ├── test_async_qdrant_knowledge_base.py │ │ ├── test_config.yml │ │ └── test_qdrant_knowledge_base.py │ └── test_knowledge_base.py ├── llm │ ├── __init__.py │ ├── conftest.py │ ├── test_azure_openai.py │ ├── test_cohere.py │ └── test_openai.py ├── query_generator │ ├── test_cohere_query_generator.py │ └── test_query_generator_integration.py ├── record_encoder │ ├── test_anyscale_record_encoder.py │ ├── test_cohere_record_encoder.py │ ├── test_jina_record_encoder.py │ ├── 
test_octoai_record_encoder.py │ ├── test_openai_record_encoder.py │ └── test_sentence_transformers_encoder.py ├── reranker │ ├── __init__.py │ ├── test_cohere_reranker.py │ └── test_transparent_reranker.py ├── tokenizer │ ├── __init__.py │ └── test_cohere_api_tokenizer.py └── utils │ ├── __init__.py │ └── test_config.py ├── unit ├── __init__.py ├── chat_engine │ ├── __init__.py │ └── test_chat_engine.py ├── chunker │ ├── __init__.py │ ├── base_test_chunker.py │ ├── test_markdown_chunker.py │ ├── test_recursive_character_chunker.py │ ├── test_stub_chunker.py │ └── test_token_chunker.py ├── cli │ ├── test_data_loader.py │ └── test_non_schematic_data_loader.py ├── context_builder │ ├── __init__.py │ └── test_stuffing_context_builder.py ├── context_engine │ └── test_context_engine.py ├── history_pruner │ ├── test_raising_history_pruner.py │ └── test_recent_history_pruner.py ├── query_generators │ ├── __init__.py │ ├── test_function_calling_query_generator.py │ ├── test_instruction_query_generator.py │ └── test_last_message_query_generator.py ├── record_encoder │ ├── __init__.py │ ├── base_test_record_encoder.py │ ├── test_dense_record_encoder.py │ ├── test_hybrid_record_encoder.py │ ├── test_jina_record_encoder.py │ ├── test_sentence_transformers_encoder.py │ └── test_stub_record_encoder.py ├── stubs │ ├── __init__.py │ ├── stub_chunker.py │ ├── stub_dense_encoder.py │ ├── stub_record_encoder.py │ └── stub_tokenizer.py ├── tokenizer │ ├── __init__.py │ ├── base_test_tokenizer.py │ ├── test_cohere_hf_tokenizer.py │ ├── test_llama_tokenizer.py │ ├── test_openai_tokenizer.py │ ├── test_stub_tokenizer.py │ └── test_tokenizer_singleton.py └── utils │ ├── __init__.py │ ├── _stub_classes.py │ └── test_config.py └── util.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:1-3.9-bullseye" 7 | 8 | // Features to add to the dev container. More info: https://containers.dev/features. 9 | // "features": {}, 10 | 11 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 12 | // "forwardPorts": [], 13 | 14 | // Use 'postCreateCommand' to run commands after the container is created. 15 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 16 | 17 | // Configure tool-specific properties. 18 | // "customizations": {}, 19 | 20 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
21 | // "remoteUser": "root" 22 | } 23 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | PINECONE_API_KEY="" 2 | OPENAI_API_KEY="" 3 | INDEX_NAME="" 4 | CANOPY_CONFIG_FILE="config/config.yaml" 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: Report a bug or an issue you've found 3 | title: "[Bug] " 4 | labels: ["bug", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to fill out this bug report! 10 | - type: checkboxes 11 | attributes: 12 | label: Is this a new bug? 13 | description: > 14 | In other words: Is this an error, flaw, failure or fault? Please search issues to see if someone has already reported the bug you encountered. 15 | options: 16 | - label: I believe this is a new bug 17 | required: true 18 | - label: I have searched the existing issues, and I could not find an existing issue for this bug 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: Current Behavior 23 | description: A concise description of what you're experiencing. 24 | validations: 25 | required: true 26 | - type: textarea 27 | attributes: 28 | label: Expected Behavior 29 | description: A concise description of what you expected to happen. 30 | validations: 31 | required: true 32 | - type: textarea 33 | attributes: 34 | label: Steps To Reproduce 35 | description: Steps to reproduce the behavior. 36 | placeholder: | 37 | 1. In this environment... 38 | 2. With this config... 39 | 3. Run '...' 40 | 4. See error... 41 | validations: 42 | required: true 43 | - type: textarea 44 | id: logs 45 | attributes: 46 | label: Relevant log output 47 | description: | 48 | If applicable, log output to help explain your problem. 49 | render: shell 50 | validations: 51 | required: false 52 | - type: textarea 53 | attributes: 54 | label: Environment 55 | description: | 56 | examples: 57 | - **OS**: Ubuntu 20.04 58 | - **Language version**: Python 3.9.12 (`python3 --version`) 59 | - **Canopy version**: 0.2 (`canopy --version`) 60 | value: | 61 | - **OS**: 62 | - **Language version**: 63 | - **Canopy version**: 64 | render: markdown 65 | validations: 66 | required: false 67 | - type: textarea 68 | attributes: 69 | label: Additional Context 70 | description: | 71 | Links? References? Anything that will give us more context about the issue you are encountering! 
72 | 73 | validations: 74 | required: false 75 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: 🤔 Ask a Question 4 | url: 'https://github.com/pinecone-io/canopy/discussions/new?category=q-a' 5 | about: Ask a question about how to use Canopy using GitHub discussions 6 | 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: ✨ Feature 2 | description: Propose a straightforward extension 3 | title: "[Feature] <title>" 4 | labels: ["enhancement", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to fill out this feature request! 10 | - type: checkboxes 11 | attributes: 12 | label: Is this your first time submitting a feature request? 13 | description: > 14 | We want to make sure that features are distinct and discoverable, 15 | so that other members of the community can find them and offer their thoughts. 16 | 17 | Issues are the right place to request straightforward extensions of existing functionality. 18 | options: 19 | - label: I have searched the existing issues, and I could not find an existing issue for this feature 20 | required: true 21 | - label: I am requesting a straightforward extension of existing functionality 22 | - type: textarea 23 | attributes: 24 | label: Describe the feature 25 | description: A clear and concise description of what you want to happen. 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: Describe alternatives you've considered 31 | description: | 32 | A clear and concise description of any alternative solutions or features you've considered. 33 | validations: 34 | required: false 35 | - type: textarea 36 | attributes: 37 | label: Who will this benefit? 38 | description: | 39 | What kind of use case will this feature be useful for? Please be specific and provide examples, this will help us prioritize properly. 40 | validations: 41 | required: false 42 | - type: input 43 | attributes: 44 | label: Are you interested in contributing this feature? 45 | description: Let us know if you want to write some code, and how we can help. 46 | validations: 47 | required: false 48 | - type: textarea 49 | attributes: 50 | label: Anything else? 51 | description: | 52 | Links? References? Anything that will give us more context about the feature you are suggesting! 
53 | validations: 54 | required: false 55 | -------------------------------------------------------------------------------- /.github/actions/install-deps-and-canopy/action.yml: -------------------------------------------------------------------------------- 1 | name: Install dependencies and canopy 2 | description: "Installs Poetry, dependencies and optionally canopy library" 3 | inputs: 4 | python-version: 5 | description: "Python version" 6 | required: true 7 | default: "3.9" 8 | install-canopy: 9 | description: "Whether to install canopy library, or dependencies only" 10 | required: true 11 | default: "true" 12 | runs: 13 | using: "composite" 14 | steps: 15 | - name: Install Poetry 16 | uses: snok/install-poetry@v1 17 | with: 18 | version: 1.3.2 19 | virtualenvs-create: true 20 | virtualenvs-in-project: true 21 | installer-parallel: true 22 | #---------------------------------------------- 23 | # load cached venv if cache exists 24 | #---------------------------------------------- 25 | - name: Load cached venv 26 | id: cached-poetry-dependencies 27 | uses: actions/cache@v3 28 | with: 29 | path: | 30 | .venv 31 | poetry.lock 32 | key: venv-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('pyproject.toml') }} 33 | #---------------------------------------------- 34 | # install dependencies if cache does not exist 35 | #---------------------------------------------- 36 | - name: Install dependencies 37 | shell: bash 38 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 39 | run: make install-extras POETRY_INSTALL_ARGS="--no-interaction --no-root --with dev" 40 | - name: Install project 41 | if: ${{ inputs.install-canopy == 'true' }} 42 | shell: bash 43 | run: make install-extras POETRY_INSTALL_ARGS="--with dev --no-interaction" 44 | -------------------------------------------------------------------------------- /.github/workflows/build-push-image.yml: -------------------------------------------------------------------------------- 1 | name: Create and publish a Docker image 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version: 7 | description: 'Version to tag the image with' 8 | required: true 9 | type: string 10 | workflow_call: 11 | inputs: 12 | version: 13 | description: 'Version to tag the image with' 14 | required: true 15 | type: string 16 | push: 17 | branches: 18 | - 'main' 19 | tags: 20 | - 'v*' 21 | jobs: 22 | build-and-push-image: 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: read 26 | packages: write 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v4 30 | - name: Set up QEMU 31 | uses: docker/setup-qemu-action@v3 32 | - name: Set up Docker Buildx 33 | uses: docker/setup-buildx-action@v3 34 | - name: Log in to the Container registry 35 | uses: docker/login-action@v3 36 | with: 37 | registry: ghcr.io 38 | username: ${{ github.actor }} 39 | password: ${{ secrets.GITHUB_TOKEN }} 40 | - name: Docker metadata 41 | id: meta 42 | uses: docker/metadata-action@v5 43 | with: 44 | images: | 45 | ghcr.io/${{ github.repository }} 46 | tags: | 47 | type=ref,event=branch,enable=${{ github.event_name == 'push' }} 48 | type=semver,pattern={{version}},enable=${{ github.event_name == 'push' }} 49 | type=raw,value=latest,enable=${{ github.event_name != 'push' }} 50 | type=raw,value=${{inputs.version}},enable=${{ github.event_name != 'push' }} 51 | - name: Create build args 52 | run: | 53 | export POETRY_INSTALL_ARGS="$(make print-var VAR=POETRY_DEFAULT_EXTRAS)" 54 | echo "POETRY_INSTALL_ARGS=$POETRY_INSTALL_ARGS" >> 
$GITHUB_OUTPUT 55 | id: build-args 56 | - name: Build and push 57 | uses: docker/build-push-action@v5 58 | with: 59 | context: . 60 | platforms: linux/amd64 61 | push: true 62 | build-args: | 63 | POETRY_INSTALL_ARGS=${{steps.build-args.outputs.POETRY_INSTALL_ARGS}} 64 | tags: ${{ steps.meta.outputs.tags }} 65 | labels: ${{ steps.meta.outputs.labels }} 66 | provenance: false 67 | cache-from: type=gha 68 | cache-to: type=gha,mode=max -------------------------------------------------------------------------------- /.github/workflows/pre-release-CI.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test installation 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | test-installation: 13 | name: Test on ${{ matrix.os }}-py${{ matrix.python-version }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, windows-latest, macos-latest] 18 | python-version: [3.9, '3.10', 3.11] 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install Poetry 32 | uses: snok/install-poetry@v1 33 | with: 34 | version: 1.3.2 35 | virtualenvs-create: true 36 | virtualenvs-in-project: true 37 | installer-parallel: true 38 | 39 | - name: Download wheels 40 | uses: actions/download-artifact@v2 41 | with: 42 | name: wheels 43 | path: ./dist/ 44 | 45 | - name: Install dev dependencies 46 | run: | 47 | poetry install --no-root --only dev --no-interaction 48 | 49 | - name: Install the wheel 50 | run: | 51 | source $VENV 52 | ls -lah ./dist 53 | pip install ./dist/canopy_sdk*.whl 54 | 55 | - name: Run unit tests 56 | run: | 57 | source $VENV 58 | pytest --html=report.html --self-contained-html tests/unit 59 | 60 | - name: Upload pytest reports 61 | if: failure() 62 | uses: actions/upload-artifact@v3 63 | with: 64 | name: pytest-report-${{ matrix.os }}-py${{ matrix.python-version }} 65 | path: .pytest_cache 66 | 67 | 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | # Mac OS 163 | **/.DS_Store 164 | 165 | datafiles/* 166 | canopy-api-docs.html 167 | .vscode/ 168 | *.jsonl 169 | -------------------------------------------------------------------------------- /.readme-content/class_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/class_architecture.png -------------------------------------------------------------------------------- /.readme-content/new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/new.gif -------------------------------------------------------------------------------- /.readme-content/rag_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/rag_flow.png -------------------------------------------------------------------------------- /.readme-content/resin-chat-no-rag.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/resin-chat-no-rag.gif -------------------------------------------------------------------------------- /.readme-content/sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/sketch.png -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TEST_WORKER_COUNT = 8 2 | 3 | POETRY_DEFAULT_EXTRAS = -E cohere -E transformers -E grpc 4 | POETRY_INSTALL_ARGS = 5 | 6 | REPOSITORY = ghcr.io/pinecone-io/canopy 7 | IMAGE_TAG = $(shell poetry version -s) 8 | 9 | CONTAINER_PORT = 8000 10 | CONTAINER_ENV_FILE = .env 11 | CONTAINER_BUILD_DIR = . 12 | CONTAINER_BUILD_PLATFORM = linux/amd64 13 | CONTAINER_SYSTEM_BUILD_ARGS = --progress plain --platform $(CONTAINER_BUILD_PLATFORM) --build-arg PORT=$(CONTAINER_PORT) --build-arg POETRY_INSTALL_ARGS="$(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS)" 14 | CONTAINER_BUILD_ARGS = 15 | 16 | # Only add the env file if it exists 17 | CONTAINER_SYSTEM_RUN_ARGS = --platform linux/amd64 -p $(CONTAINER_PORT):$(CONTAINER_PORT) $(shell [ -e "$(CONTAINER_ENV_FILE)" ] && echo "--env-file $(CONTAINER_ENV_FILE)") 18 | CONTAINER_RUN_ARGS = 19 | 20 | 21 | .PHONY: lint static install install-extras install-all-extras test test-unit test-system test-e2e docker-build docker-build-dev docker-run docker-run-dev print-var help 22 | 23 | lint: 24 | poetry run flake8 . 
25 | 26 | static: 27 | poetry run mypy src 28 | 29 | install: 30 | poetry install $(POETRY_INSTALL_ARGS) 31 | 32 | install-extras: 33 | poetry install $(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS) 34 | 35 | install-all-extras: 36 | poetry install --all-extras $(POETRY_INSTALL_ARGS) 37 | 38 | test: 39 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope 40 | 41 | test-unit: 42 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/unit 43 | 44 | test-system: 45 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/system 46 | 47 | test-e2e: 48 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/e2e 49 | 50 | docker-build: 51 | @echo "Building Docker image..." 52 | docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY):$(IMAGE_TAG) $(CONTAINER_BUILD_DIR) 53 | @echo "Docker build complete." 54 | 55 | docker-build-dev: 56 | @echo "Building Docker image for development..." 57 | docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY)-dev:$(IMAGE_TAG) --target=development $(CONTAINER_BUILD_DIR) 58 | @echo "Development Docker build complete." 59 | 60 | docker-run: 61 | docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) $(REPOSITORY):$(IMAGE_TAG) 62 | 63 | docker-run-dev: 64 | docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) -it $(REPOSITORY)-dev:$(IMAGE_TAG) 65 | 66 | print-var: 67 | @echo "$($(VAR))" 68 | 69 | help: 70 | @echo "Available targets:" 71 | @echo "" 72 | @echo " -- DEV -- " 73 | @echo " make install - Install only the required dependencies without any extras." 74 | @echo " make install-extras - Install the dependencies with the default extras." 75 | @echo " make install-all-extras - Install the dependencies with all extras." 76 | @echo " make lint - Lint the code." 77 | @echo " make static - Run static type checks." 78 | @echo " make test - Test the code." 79 | @echo " make test-unit - Run unit tests." 80 | @echo " make test-system - Run system tests." 81 | @echo " make test-e2e - Run e2e tests." 82 | @echo "" 83 | @echo " -- DOCKER -- " 84 | @echo " make docker-build - Build the Docker image." 85 | @echo " make docker-build-dev - Build the Docker image for development." 86 | @echo " make docker-run - Run the Docker image." 87 | @echo " make docker-run-dev - Run the Docker image for development." 88 | @echo "" 89 | @echo " -- MISC -- " 90 | @echo " make print-var VAR=<variable> - Print the value of a variable." 
91 | 92 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "canopy-sdk" 3 | version = "0.9.0" 4 | description = "Retrieval Augmented Generation (RAG) framework and context engine powered by Pinecone" 5 | authors = ["Relevance Team <relevance@pinecone.io>"] 6 | readme = "README.md" 7 | license = "Apache-2.0" 8 | packages = [{include = "canopy", from = "src"}, 9 | {include = "canopy_cli", from = "src"}, 10 | {include = "canopy_server", from = "src"},] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.9,<3.13" 14 | python-dotenv = "^1.0.0" 15 | openai = "^1.2.3" 16 | tiktoken = "^0.3.3" 17 | pydantic = "^2.0.0" 18 | pandas-stubs = "^2.0.3.230814" 19 | fastapi = ">=0.93.0, <1.0.0" 20 | uvicorn = ">=0.20.0, <1.0.0" 21 | tenacity = "^8.2.1" 22 | sse-starlette = "^1.6.5" 23 | types-tqdm = "^4.61.0" 24 | tqdm = "^4.66.1" 25 | gunicorn = "^21.2.0" 26 | types-pyyaml = "^6.0.12.12" 27 | jsonschema = "^4.2.0" 28 | types-jsonschema = "^4.2.0" 29 | prompt-toolkit = "^3.0.39" 30 | tokenizers = "^0.15.0" 31 | transformers = {version = "^4.35.2", optional = true} 32 | sentencepiece = "^0.1.99" 33 | pandas = "2.0.0" 34 | pyarrow = "^14.0.1" 35 | qdrant-client = {version = "^1.8.0", optional = true} 36 | cohere = { version = "^4.37", optional = true } 37 | 38 | 39 | pinecone-text = "^0.8.0" 40 | # Extra: torch (Relies on pinecone-text[dense]) 41 | # Dependencies here should be synced with pinecone-text's pyproject.toml 42 | # See: https://github.com/pinecone-io/pinecone-text/blob/0eb00a202f5c9bc8cc48c8b7536fcbabf95f096e/pyproject.toml#L30 43 | torch = { version = ">=1.13.1", optional = true } 44 | sentence-transformers = { version = ">=2.0.0", optional = true } 45 | 46 | 47 | pinecone-client = "^3.0.0" 48 | # Extra: grpc (Relies on pinecone-client[grpc]) 49 | # Dependencies here should be synced with pinecone-python-client's pyproject.toml 50 | # See: https://github.com/pinecone-io/pinecone-python-client/blob/886f932b66521a6ab5b1e076f6a53ba2f16eb41b/pyproject.toml#L94 51 | grpcio = { version = ">=1.44.0", optional = true } 52 | grpc-gateway-protoc-gen-openapiv2 = { version = "0.1.0", optional = true } 53 | googleapis-common-protos = { version = ">=1.53.0", optional = true } 54 | lz4 = { version = ">=3.1.3", optional = true } 55 | protobuf = { version = "~=3.20.0", optional = true } 56 | 57 | 58 | 59 | [tool.poetry.extras] 60 | cohere = ["cohere"] 61 | torch = ["torch", "sentence-transformers"] 62 | transformers = ["transformers"] 63 | grpc = ["grpcio", "grpc-gateway-protoc-gen-openapiv2", "googleapis-common-protos", "lz4", "protobuf"] 64 | qdrant = ["qdrant-client"] 65 | 66 | 67 | [tool.poetry.group.dev.dependencies] 68 | pytest = "^7.3.2" 69 | jupyter = "^1.0.0" 70 | mypy = "^1.4.1" 71 | flake8 = "^6.1.0" 72 | pytest-html = "^4.1.0" 73 | flake8-pyproject = "^1.2.3" 74 | asyncio = "^3.4.3" 75 | pytest-asyncio = "^0.14.0" 76 | pytest-mock = "^3.6.1" 77 | pytest-xdist = "^3.3.1" 78 | types-requests = "^2.31.0.2" 79 | httpx = "^0.25.0" 80 | pydoclint = "^0.3.8" 81 | pytest-dotenv = "^0.5.2" 82 | 83 | [build-system] 84 | requires = ["poetry-core"] 85 | build-backend = "poetry.core.masonry.api" 86 | 87 | 88 | [tool.mypy] 89 | allow_redefinition = true 90 | exclude = ['tests', '.venv'] 91 | 92 | [[tool.mypy.overrides]] 93 | module = [ 94 | 'pinecone_text.*', 95 | 'pinecone_datasets', 96 | 'pinecone', 97 | 'transformers.*', 98 | 'tokenizers.*', 99 | 
'cohere.*', 100 | 'pinecone.grpc', 101 | 'huggingface_hub.utils', 102 | 'qdrant_client.*', 103 | 'grpc.*' 104 | ] 105 | ignore_missing_imports = true 106 | 107 | 108 | [tool.flake8] 109 | per-file-ignores = [ 110 | '*/__init__.py:F401', 111 | ] 112 | exclude = ['.venv'] 113 | max-line-length = 88 114 | 115 | # PyDocLint configuration 116 | style = 'google' 117 | arg-type-hints-in-docstring = false 118 | require-return-section-when-returning-nothing = false 119 | allow-init-docstring = true 120 | check-return-types = false 121 | skip-checking-raises = true 122 | 123 | [tool.poetry.scripts] 124 | canopy = "canopy_cli.cli:cli" 125 | 126 | [tool.pytest.ini_options] 127 | log_cli = true 128 | -------------------------------------------------------------------------------- /scripts/cleanup_indexes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from tests.util import cleanup_indexes 4 | 5 | 6 | def main(): 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | if len(sys.argv) != 2: 11 | logger.info("Usage: python scripts/cleanup_indexes.py <testrun_uid>") 12 | sys.exit(1) 13 | 14 | testrun_uid = sys.argv[1] 15 | if testrun_uid: 16 | logger.info(f"Cleaning up indexes for testrun_uid '{testrun_uid}'") 17 | cleanup_indexes(testrun_uid) 18 | else: 19 | logger.info("Passed testrun_uid is empty.") 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /src/canopy/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import warnings 3 | import logging 4 | import os 5 | from typing import List 6 | 7 | # Taken from https://stackoverflow.com/a/67097076 8 | __version__ = importlib.metadata.version("canopy-sdk") 9 | 10 | 11 | IGNORED_WARNINGS: List[str] = [ 12 | ] 13 | 14 | IGNORED_WARNING_IN_MODULES = [ 15 | "transformers", 16 | ] 17 | 18 | for warning in IGNORED_WARNINGS: 19 | warnings.filterwarnings("ignore", message=warning) 20 | for module in IGNORED_WARNING_IN_MODULES: 21 | warnings.filterwarnings("ignore", module=module) 22 | logging.getLogger(module).setLevel(logging.ERROR) 23 | 24 | # Apparently, `transformers` has its own logging system, and needs to be silenced separately # noqa: E501 25 | os.environ["TRANSFORMERS_VERBOSITY"] = "error" 26 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_engine import ChatEngine 2 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class InvalidRequestError(Exception): 4 | pass 5 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/__init__.py: -------------------------------------------------------------------------------- 1 | from .recent import RecentHistoryPruner 2 | from .raising import RaisingHistoryPruner 3 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | 4 | 
from canopy.tokenizer import Tokenizer 5 | from canopy.models.data_models import Messages, SystemMessage, Context 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class HistoryPruner(ABC, ConfigurableMixin): 10 | 11 | def __init__(self): 12 | self._tokenizer = Tokenizer() 13 | 14 | @abstractmethod 15 | def build(self, 16 | chat_history: Messages, 17 | max_tokens: int, 18 | system_prompt: Optional[str] = None, 19 | context: Optional[Context] = None, 20 | ) -> Messages: 21 | raise NotImplementedError 22 | 23 | async def abuild(self, 24 | chat_history: Messages, 25 | max_tokens: int) -> Messages: 26 | raise NotImplementedError() 27 | 28 | def _max_tokens_history(self, 29 | max_tokens: int, 30 | system_prompt: Optional[str] = None, 31 | context: Optional[Context] = None, ) -> int: 32 | if system_prompt is not None: 33 | max_tokens -= self._tokenizer.messages_token_count( 34 | [SystemMessage(content=system_prompt)] 35 | ) 36 | 37 | if context is not None: 38 | max_tokens -= context.num_tokens 39 | 40 | return max_tokens 41 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/raising.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from canopy.chat_engine.history_pruner.base import HistoryPruner 4 | from canopy.models.data_models import Messages, Context 5 | 6 | 7 | class RaisingHistoryPruner(HistoryPruner): 8 | 9 | def build(self, 10 | chat_history: Messages, 11 | max_tokens: int, 12 | system_prompt: Optional[str] = None, 13 | context: Optional[Context] = None, ) -> Messages: 14 | max_tokens = self._max_tokens_history(max_tokens, 15 | system_prompt, 16 | context) 17 | token_count = self._tokenizer.messages_token_count(chat_history) 18 | if token_count > max_tokens: 19 | raise ValueError(f"The history require {token_count} tokens, " 20 | f"which exceeds the calculated limit for history " 21 | f"of {max_tokens} tokens left for" 22 | f" history out of {max_tokens} tokens" 23 | f" allowed in context window.") 24 | return chat_history 25 | 26 | async def abuild(self, 27 | chat_history: Messages, 28 | max_tokens: int) -> Messages: 29 | raise NotImplementedError() 30 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/recent.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from canopy.chat_engine.history_pruner.base import HistoryPruner 4 | from canopy.models.data_models import Messages, Context 5 | 6 | 7 | class RecentHistoryPruner(HistoryPruner): 8 | 9 | def __init__(self, 10 | min_history_messages: int = 1): 11 | super().__init__() 12 | self._min_history_messages = min_history_messages 13 | 14 | def build(self, 15 | chat_history: Messages, 16 | max_tokens: int, 17 | system_prompt: Optional[str] = None, 18 | context: Optional[Context] = None, 19 | ) -> Messages: 20 | max_tokens = self._max_tokens_history(max_tokens, 21 | system_prompt, 22 | context) 23 | token_count = self._tokenizer.messages_token_count(chat_history) 24 | if token_count < max_tokens: 25 | return chat_history 26 | 27 | truncated_history = chat_history[-self._min_history_messages:] 28 | token_count = self._tokenizer.messages_token_count(truncated_history) 29 | if token_count > max_tokens: 30 | raise ValueError(f"The {self._min_history_messages} most recent messages in" 31 | f" history require {token_count} tokens, which exceeds 
the" 32 | f" calculated limit for history of {max_tokens}" 33 | f" tokens out of total {max_tokens} allowed" 34 | f" in context window.") 35 | 36 | for message in reversed(chat_history[:-self._min_history_messages]): 37 | token_count = self._tokenizer.messages_token_count( 38 | truncated_history + [message] 39 | ) 40 | 41 | # If the message can fit into the remaining tokens, add it 42 | if token_count > max_tokens: 43 | break 44 | 45 | truncated_history.insert(0, message) 46 | 47 | return truncated_history 48 | 49 | async def abuild(self, 50 | chat_history: Messages, 51 | max_tokens: int) -> Messages: 52 | raise NotImplementedError() 53 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import QueryGenerator 2 | from .function_calling import FunctionCallingQueryGenerator 3 | from .last_message import LastMessageQueryGenerator 4 | from .instruction import InstructionQueryGenerator 5 | from .cohere import CohereQueryGenerator 6 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.models.data_models import Messages, Query 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class QueryGenerator(ABC, ConfigurableMixin): 9 | @abstractmethod 10 | def generate(self, 11 | messages: Messages, 12 | max_prompt_tokens: int, 13 | ) -> List[Query]: 14 | pass 15 | 16 | @abstractmethod 17 | async def agenerate(self, 18 | messages: Messages, 19 | max_prompt_tokens: int, 20 | ) -> List[Query]: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/cohere.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, cast 2 | 3 | from canopy.chat_engine.query_generator import QueryGenerator 4 | from canopy.chat_engine.history_pruner.raising import RaisingHistoryPruner 5 | from canopy.llm import BaseLLM, CohereLLM 6 | from canopy.models.data_models import Messages, Query 7 | 8 | 9 | class CohereQueryGenerator(QueryGenerator): 10 | """ 11 | Query generator for LLM clients that have a built-in feature to 12 | generate search queries from chat messages. 
13 | """ 14 | _DEFAULT_COMPONENTS = { 15 | "llm": CohereLLM, 16 | } 17 | 18 | def __init__(self, 19 | *, 20 | llm: Optional[BaseLLM] = None): 21 | self._llm = llm or self._DEFAULT_COMPONENTS["llm"]() 22 | 23 | if not isinstance(self._llm, CohereLLM): 24 | raise NotImplementedError( 25 | "CohereQueryGenerator only compatible with CohereLLM" 26 | ) 27 | 28 | self._history_pruner = RaisingHistoryPruner() 29 | 30 | def generate(self, 31 | messages: Messages, 32 | max_prompt_tokens: int) -> List[Query]: 33 | messages = self._history_pruner.build(chat_history=messages, 34 | max_tokens=max_prompt_tokens) 35 | llm = cast(CohereLLM, self._llm) 36 | queries = llm.generate_search_queries(messages) 37 | return [Query(text=query) for query in queries] 38 | 39 | async def agenerate(self, 40 | messages: Messages, 41 | max_prompt_tokens: int) -> List[Query]: 42 | raise NotImplementedError 43 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/function_calling.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from canopy.chat_engine.history_pruner import RaisingHistoryPruner 4 | from canopy.chat_engine.query_generator import QueryGenerator 5 | from canopy.llm import BaseLLM, OpenAILLM 6 | from canopy.llm.models import (Function, FunctionParameters, 7 | FunctionArrayProperty) 8 | from canopy.models.data_models import Messages, Query 9 | 10 | DEFAULT_SYSTEM_PROMPT = """Your task is to formulate search queries for a search engine, to assist in responding to the user's question. 11 | You should break down complex questions into sub-queries if needed. 12 | """ # noqa: E501 13 | 14 | DEFAULT_FUNCTION_DESCRIPTION = """Query search engine for relevant information""" 15 | 16 | 17 | class FunctionCallingQueryGenerator(QueryGenerator): 18 | 19 | _DEFAULT_COMPONENTS = { 20 | "llm": OpenAILLM, 21 | } 22 | 23 | def __init__(self, 24 | *, 25 | llm: Optional[BaseLLM] = None, 26 | prompt: Optional[str] = None, 27 | function_description: Optional[str] = None): 28 | self._llm = llm or self._DEFAULT_COMPONENTS["llm"]() 29 | self._system_prompt = prompt or DEFAULT_SYSTEM_PROMPT 30 | self._function_description = \ 31 | function_description or DEFAULT_FUNCTION_DESCRIPTION 32 | self._history_pruner = RaisingHistoryPruner() 33 | 34 | def generate(self, 35 | messages: Messages, 36 | max_prompt_tokens: int) -> List[Query]: 37 | messages = self._history_pruner.build(system_prompt=self._system_prompt, 38 | chat_history=messages, 39 | max_tokens=max_prompt_tokens) 40 | try: 41 | arguments = self._llm.enforced_function_call( 42 | system_prompt=self._system_prompt, 43 | chat_history=messages, 44 | function=self._function 45 | ) 46 | except NotImplementedError as e: 47 | raise RuntimeError( 48 | "FunctionCallingQueryGenerator requires an LLM that supports " 49 | "function calling. Please provide a different LLM, " 50 | "or alternatively select a different QueryGenerator class. 
" 51 | f"Received the following error from LLM:\n{e}" 52 | ) from e 53 | 54 | return [Query(text=q) 55 | for q in arguments["queries"]] 56 | 57 | async def agenerate(self, 58 | messages: Messages, 59 | max_prompt_tokens: int) -> List[Query]: 60 | raise NotImplementedError 61 | 62 | @property 63 | def _function(self) -> Function: 64 | return Function( 65 | name="query_knowledgebase", 66 | description=self._function_description, 67 | parameters=FunctionParameters( 68 | required_properties=[ 69 | FunctionArrayProperty( 70 | name="queries", 71 | items_type="string", 72 | description='List of queries to send to the search engine.', 73 | ), 74 | ] 75 | ), 76 | ) 77 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/last_message.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.chat_engine.query_generator import QueryGenerator 4 | from canopy.models.data_models import Messages, Query, Role 5 | 6 | 7 | class LastMessageQueryGenerator(QueryGenerator): 8 | """ 9 | Returns the last message as a query without running any LLMs. This can be 10 | considered as the most basic query generation. Please use other query generators 11 | for more accurate results. 12 | """ 13 | 14 | def generate(self, 15 | messages: Messages, 16 | max_prompt_tokens: int) -> List[Query]: 17 | """ 18 | max_prompt_token is dismissed since we do not consume any token for 19 | generating the queries. 20 | """ 21 | 22 | if len(messages) == 0: 23 | raise ValueError("Passed chat history does not contain any messages. " 24 | "Please include at least one message in the history.") 25 | 26 | last_message = messages[-1] 27 | 28 | if last_message.role != Role.USER: 29 | raise ValueError(f"Expected a UserMessage, got {type(last_message)}.") 30 | 31 | return [Query(text=last_message.content)] 32 | 33 | async def agenerate(self, 34 | messages: Messages, 35 | max_prompt_tokens: int) -> List[Query]: 36 | return self.generate(messages, max_prompt_tokens) 37 | -------------------------------------------------------------------------------- /src/canopy/config_templates/anyscale.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Canopy Server 3 | # =========================================================== 4 | tokenizer: 5 | # ------------------------------------------------------------------------------------------- 6 | # Tokenizer configuration 7 | # Use LLamaTokenizer from HuggingFace with the relevant OSS model (e.g. LLama2) 8 | # ------------------------------------------------------------------------------------------- 9 | type: LlamaTokenizer # Options: [OpenAITokenizer, LlamaTokenizer] 10 | params: 11 | model_name: hf-internal-testing/llama-tokenizer 12 | 13 | chat_engine: 14 | # ------------------------------------------------------------------------------------------- 15 | # Chat engine configuration 16 | # Use Anyscale Endpoint as the open source LLM provider 17 | # You can find the list of supported LLM at https://docs.endpoints.anyscale.com/category/supported-models 18 | # ------------------------------------------------------------------------------------------- 19 | params: 20 | max_prompt_tokens: 2048 # The maximum number of tokens to use for input prompt to the LLM. 
21 | llm: &llm 22 | type: AnyscaleLLM 23 | params: 24 | model_name: meta-llama/Llama-2-7b-chat-hf # The name of the model to use. 25 | 26 | query_builder: 27 | type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 28 | llm: 29 | type: AnyscaleLLM 30 | params: 31 | model_name: mistralai/Mistral-7B-Instruct-v0.1 32 | 33 | context_engine: 34 | # ------------------------------------------------------------------------------------------------------------- 35 | # ContextEngine configuration 36 | # ------------------------------------------------------------------------------------------------------------- 37 | knowledge_base: 38 | # ----------------------------------------------------------------------------------------------------------- 39 | # KnowledgeBase configuration 40 | # ----------------------------------------------------------------------------------------------------------- 41 | record_encoder: 42 | # -------------------------------------------------------------------------- 43 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 44 | # Use Anyscale's Embedding endpoint for dense encoding 45 | # -------------------------------------------------------------------------- 46 | type: AnyscaleRecordEncoder 47 | params: 48 | model_name: # The name of the model to use for encoding 49 | thenlper/gte-large 50 | batch_size: 100 # The number of document chunks to encode in each call to the encoding model 51 | -------------------------------------------------------------------------------- /src/canopy/config_templates/azure.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Azure OpenAI 3 | # =========================================================== 4 | 5 | 6 | query_builder_prompt: &query_builder_prompt | 7 | Your task is to formulate search queries for a search engine, to assist in responding to the user's question. 8 | You should break down complex questions into sub-queries if needed. 9 | 10 | 11 | tokenizer: 12 | type: OpenAITokenizer # Options: [OpenAITokenizer] 13 | params: 14 | model_name: gpt-3.5-turbo # Configure the tokenizer that matches the OpenAI model in your deployment 15 | 16 | 17 | chat_engine: 18 | 19 | llm: &llm 20 | # ------------------------------------------------------------------------------------------------------------- 21 | # LLM configuration 22 | # Configuration of the LLM (Large Language Model) 23 | # ------------------------------------------------------------------------------------------------------------- 24 | type: AzureOpenAILLM # Options: [OpenAILLM, AzureOpenAILLM] 25 | params: 26 | model_name: your-deployment-name # Specify the name of the LLM deployment to use. 27 | api_version: 2023-12-01-preview # Specify the API version to use. 
28 | 29 | query_builder: 30 | # ------------------------------------------------------------------------------------------------------------- 31 | # LLM configuration 32 | # Configuration of the LLM (Large Language Model) 33 | # ------------------------------------------------------------------------------------------------------------- 34 | type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 35 | params: 36 | prompt: *query_builder_prompt # The query builder's system prompt for calling the LLM 37 | function_description: # A function description passed to the LLM's `function_calling` API 38 | Query search engine for relevant information 39 | 40 | llm: # The LLM that the query builder will use to generate queries. Leave `*llm` to use the chat engine's LLM 41 | <<: *llm 42 | 43 | context_engine: 44 | 45 | knowledge_base: 46 | 47 | record_encoder: 48 | # -------------------------------------------------------------------------- 49 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 50 | # The record encoder is responsible for encoding document chunks to a vector representation 51 | # -------------------------------------------------------------------------- 52 | type: AzureOpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder] 53 | params: 54 | model_name: # Specify the name of the embedding deployment to use. 55 | your-embedding-deployment-name 56 | batch_size: 400 # The number of document chunks to encode in each call to the encoding model -------------------------------------------------------------------------------- /src/canopy/config_templates/octoai.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Canopy Server 3 | # =========================================================== 4 | tokenizer: 5 | # ------------------------------------------------------------------------------------------- 6 | # Tokenizer configuration 7 | # Use LLamaTokenizer from HuggingFace with the relevant OSS model (e.g. LLama2) 8 | # ------------------------------------------------------------------------------------------- 9 | type: LlamaTokenizer # Options: [OpenAITokenizer, LlamaTokenizer] 10 | params: 11 | model_name: hf-internal-testing/llama-tokenizer 12 | 13 | chat_engine: 14 | # ------------------------------------------------------------------------------------------- 15 | # Chat engine configuration 16 | # Use OctoAI as the open source LLM provider 17 | # You can find the list of supported LLMs at https://octo.ai/docs/text-gen-solution/rest-api 18 | # ------------------------------------------------------------------------------------------- 19 | params: 20 | max_prompt_tokens: 2048 # The maximum number of tokens to use for input prompt to the LLM. 21 | llm: &llm 22 | type: OctoAILLM 23 | params: 24 | model_name: mistral-7b-instruct-fp16 # The name of the model to use. 
25 | 26 | # query_builder: 27 | # type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 28 | # llm: 29 | # type: OctoAILLM 30 | # params: 31 | # model_name: mistral-7b-instruct-fp16 32 | 33 | context_engine: 34 | # ------------------------------------------------------------------------------------------------------------- 35 | # ContextEngine configuration 36 | # ------------------------------------------------------------------------------------------------------------- 37 | knowledge_base: 38 | # ----------------------------------------------------------------------------------------------------------- 39 | # KnowledgeBase configuration 40 | # ----------------------------------------------------------------------------------------------------------- 41 | record_encoder: 42 | # -------------------------------------------------------------------------- 43 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 44 | # Use OctoAI's Embedding endpoint for dense encoding 45 | # -------------------------------------------------------------------------- 46 | type: OctoAIRecordEncoder 47 | params: 48 | model_name: # The name of the model to use for encoding 49 | thenlper/gte-large 50 | batch_size: 2048 # The number of document chunks to encode in each call to the encoding model 51 | -------------------------------------------------------------------------------- /src/canopy/context_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .context_engine import ContextEngine 2 | -------------------------------------------------------------------------------- /src/canopy/context_engine/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .stuffing import StuffingContextBuilder 2 | from .base import ContextBuilder 3 | -------------------------------------------------------------------------------- /src/canopy/context_engine/context_builder/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import QueryResult 5 | from canopy.models.data_models import Context 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class ContextBuilder(ABC, ConfigurableMixin): 10 | """ 11 | BaseContextBuilder is an abstract class that defines the interface for a context 12 | builder. 
13 | """ 14 | 15 | @abstractmethod 16 | def build(self, 17 | query_results: List[QueryResult], 18 | max_context_tokens: int, ) -> Context: 19 | pass 20 | 21 | @abstractmethod 22 | async def abuild(self, 23 | query_results: List[QueryResult], 24 | max_context_tokens: int, ) -> Context: 25 | pass 26 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/__init__.py: -------------------------------------------------------------------------------- 1 | from .knowledge_base import list_canopy_indexes 2 | from .knowledge_base import KnowledgeBase 3 | from .qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 4 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional 3 | 4 | from canopy.knowledge_base.models import QueryResult 5 | from canopy.models.data_models import Query, Document 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class BaseKnowledgeBase(ABC, ConfigurableMixin): 10 | """ 11 | KnowledgeBase is an abstract class that defines the interface for a knowledge base. 12 | """ 13 | 14 | @abstractmethod 15 | def query(self, 16 | queries: List[Query], 17 | global_metadata_filter: Optional[dict] = None, 18 | namespace: Optional[str] = None 19 | ) -> List[QueryResult]: 20 | pass 21 | 22 | @abstractmethod 23 | def upsert(self, 24 | documents: List[Document], 25 | namespace: str = "", ) -> None: 26 | pass 27 | 28 | # TODO: Do we want delete by metadata? 29 | @abstractmethod 30 | def delete(self, 31 | document_ids: List[str], 32 | namespace: str = "") -> None: 33 | pass 34 | 35 | @abstractmethod 36 | def verify_index_connection(self) -> None: 37 | pass 38 | 39 | @abstractmethod 40 | async def aquery(self, 41 | queries: List[Query], 42 | global_metadata_filter: Optional[dict] = None, 43 | namespace: Optional[str] = None 44 | ) -> List[QueryResult]: 45 | pass 46 | 47 | @abstractmethod 48 | async def aupsert(self, 49 | documents: List[Document], 50 | namespace: str = "", 51 | ) -> None: 52 | pass 53 | 54 | @abstractmethod 55 | async def adelete(self, 56 | document_ids: List[str], 57 | namespace: str = "") -> None: 58 | pass 59 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | from .token_chunker import TokenChunker 2 | from .markdown import MarkdownChunker 3 | from .base import Chunker 4 | 5 | __ALL__ = [ 6 | "MarkdownChunker", 7 | "TokenChunker", 8 | "Chunker", 9 | ] 10 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import KBDocChunk 5 | from canopy.models.data_models import Document 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class Chunker(ABC, ConfigurableMixin): 10 | """ 11 | Base class for chunkers. Chunkers take a document (id, text, ...) 12 | and return a list of KBDocChunks (id, text, document_id, ...) 
13 | Chunker is an abstract class that must be subclassed to be used. 14 | It also extends ConfigurableMixin, which means that every subclass of 15 | Chunker can be referenced by a name and configured in a config file. 16 | """ 17 | 18 | def chunk_documents(self, documents: List[Document]) -> List[KBDocChunk]: 19 | """ 20 | chunk_documents takes a list of documents and returns a list of KBDocChunks. 21 | This method is just a wrapper around chunk_single_document that can be 22 | used to chunk a list of documents. 23 | 24 | Args: 25 | documents: list of documents 26 | 27 | Returns: 28 | chunks: list of chunks of type KBDocChunk 29 | """ 30 | chunks: List[KBDocChunk] = [] 31 | for doc in documents: 32 | chunks.extend(self.chunk_single_document(doc)) 33 | return chunks 34 | 35 | async def achunk_documents(self, documents: List[Document]) -> List[KBDocChunk]: 36 | chunks: List[KBDocChunk] = [] 37 | for doc in documents: 38 | chunks.extend(await self.achunk_single_document(doc)) 39 | return chunks 40 | 41 | @abstractmethod 42 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 43 | """ 44 | chunk_single_document takes a document and returns a 45 | list of KBDocChunks. This is the main method 46 | that must be implemented by every subclass of Chunker. 47 | 48 | Args: 49 | document: the single document to be chunked 50 | 51 | Returns: 52 | chunks: list of KBDocChunk chunks 53 | """ 54 | pass 55 | 56 | @abstractmethod 57 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 58 | raise NotImplementedError() 59 | 60 | def generate_chunk_id(self, document_id: str, chunk_index: int) -> str: 61 | return f"{document_id}_{chunk_index}" 62 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/markdown.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .langchain_text_splitter import Language, RecursiveCharacterTextSplitter 4 | from .recursive_character import RecursiveCharacterChunker 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.models.data_models import Document 7 | 8 | 9 | class MarkdownChunker(RecursiveCharacterChunker): 10 | """ 11 | MarkdownChunker is a subclass of RecursiveCharacterChunker that is configured 12 | to chunk markdown documents. It uses RecursiveCharacterTextSplitter to split 13 | the text of the document into chunks, by providing the separators for markdown documents 14 | (also from LangChainTextSplitter, with modifications). 15 | """ # noqa: E501 16 | 17 | def __init__(self, 18 | chunk_size: int = 256, 19 | chunk_overlap: int = 0, 20 | keep_separator: bool = True 21 | ): 22 | """ 23 | Initializes RecursiveCharacterChunker with the separators for markdown documents. 24 | 25 | Args: 26 | chunk_size: size of the chunks. Defaults to 256 tokens. 27 | chunk_overlap: overlap between chunks. Defaults to 0. 28 | keep_separator: whether to keep the separator in the chunk. Defaults to True.
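        A usage sketch (illustrative only; it assumes the global Tokenizer singleton was initialized beforehand, since chunk sizes are measured in tokens):
            >>> chunker = MarkdownChunker(chunk_size=256, chunk_overlap=0)
            >>> chunks = chunker.chunk_documents(documents)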
29 | 30 | """ # noqa: E501 31 | separators = RecursiveCharacterTextSplitter.get_separators_for_language( 32 | Language.MARKDOWN 33 | ) 34 | super().__init__(chunk_size=chunk_size, 35 | chunk_overlap=chunk_overlap, 36 | separators=separators, 37 | keep_separator=keep_separator) 38 | 39 | async def achunk_single_document(self, 40 | document: Document) -> List[KBDocChunk]: 41 | raise NotImplementedError() 42 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/recursive_character.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List, Optional 3 | 4 | from .langchain_text_splitter import RecursiveCharacterTextSplitter 5 | 6 | from canopy.knowledge_base.chunker.base import Chunker 7 | from canopy.knowledge_base.models import KBDocChunk 8 | from canopy.tokenizer import Tokenizer 9 | from canopy.models.data_models import Document 10 | 11 | 12 | class RecursiveCharacterChunker(Chunker): 13 | """ 14 | A chunker that splits a document into chunks of a given size, using a recursive character splitter. 15 | A RecursiveCharacterChunker is a derived class of Chunker, which means that it can be referenced by a name 16 | and configured in a config file. 17 | """ # noqa: E501 18 | 19 | def __init__(self, 20 | chunk_size: int = 256, 21 | chunk_overlap: int = 0, 22 | separators: Optional[List[str]] = None, 23 | keep_separator: bool = True, 24 | ): 25 | """ 26 | RecursiveCharacterTextSplitter is a text splitter from the langchain library. 27 | It splits a text into chunks of a given size, using a recursive character splitter. 28 | 29 | Args: 30 | chunk_size: size of the chunks, in tokens 31 | chunk_overlap: overlap between chunks 32 | separators: list of separators to use for splitting the text 33 | keep_separator: whether to keep the separator in the chunk or not 34 | """ # noqa: E501 35 | self._tokenizer = Tokenizer() 36 | self._chunker = RecursiveCharacterTextSplitter( 37 | chunk_size=chunk_size, 38 | chunk_overlap=chunk_overlap, 39 | length_function=self._tokenizer.token_count, 40 | separators=separators, 41 | keep_separator=keep_separator) 42 | 43 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 44 | """ 45 | Using the RecursiveCharacterTextSplitter, this method takes a document and returns a list of KBDocChunks. 46 | Args: 47 | document: document to be chunked 48 | 49 | Returns: 50 | chunks: list of KBDocChunk chunks from the document, where the text is split 51 | evenly using the RecursiveCharacterTextSplitter 52 | """ # noqa: E501 53 | # TODO: check overlap not bigger than max_chunk_size 54 | text_chunks = self._chunker.split_text(document.text) 55 | return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 56 | document_id=document.id, 57 | text=text_chunk, 58 | source=document.source, 59 | metadata=deepcopy(document.metadata)) 60 | for i, text_chunk in enumerate(text_chunks)] 61 | 62 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 63 | raise NotImplementedError() 64 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/token_chunker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .base import Chunker 4 | from ..models import KBDocChunk 5 | from canopy.tokenizer import Tokenizer 6 | from ...models.data_models import Document 7 | 8 | 9 | class 
TokenChunker(Chunker): 10 | """ 11 | Simple chunker that splits a document into chunks (groups of tokens) of a given size, using a tokenizer. 12 | A TokenChunker is a derived class of Chunker, which means that it can be referenced by a name 13 | and configured in a config file. 14 | """ # noqa: E501 15 | 16 | def __init__(self, 17 | max_chunk_size: int = 256, 18 | overlap: int = 30, ): 19 | """ 20 | Using the global tokenizer, sets the class parameters for the TokenChunker 21 | and validates overlap and max_chunk_size. 22 | 23 | Args: 24 | max_chunk_size: size of the chunks, in tokens 25 | overlap: overlap between chunks, in tokens 26 | """ # noqa: E501 27 | 28 | # TODO: should add check for overlap not bigger than max_chunk_size 29 | if overlap < 0: 30 | cls_name = self.__class__.__name__ 31 | raise ValueError( 32 | f"overlap for {cls_name} can't be negative, got: {overlap}" 33 | ) 34 | 35 | if max_chunk_size <= 0: 36 | cls_name = self.__class__.__name__ 37 | raise ValueError( 38 | f"max_chunk_size for {cls_name} must be positive, got: {max_chunk_size}" 39 | ) 40 | 41 | self._tokenizer = Tokenizer() 42 | self._chunk_size = max_chunk_size 43 | self._overlap = overlap 44 | 45 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 46 | """ 47 | This method takes a document and returns a list of KBDocChunks, where the text is split 48 | evenly using the tokenizer. First the text is tokenized, then the tokens are split into chunks 49 | of a given size, with overlap between chunks. 50 | The last chunk is handled such that if it is smaller than the overlap, it is removed. 51 | 52 | Args: 53 | document: document to be chunked 54 | 55 | Returns: 56 | text_chunks: list of KBDocChunk chunks from the document 57 | """ # noqa: E501 58 | tokens = self._tokenizer.tokenize(document.text) 59 | token_chunks = [tokens[i:i + self._chunk_size] 60 | for i in range(0, len(tokens), 61 | self._chunk_size - self._overlap)] 62 | 63 | if len(token_chunks) == 0: 64 | return [] 65 | 66 | # remove last chunk if it is smaller than overlap 67 | if len(token_chunks[-1]) <= self._overlap and len(token_chunks) > 1: 68 | token_chunks = token_chunks[:-1] 69 | 70 | text_chunks = [self._tokenizer.detokenize(chunk) 71 | for chunk in token_chunks] 72 | return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 73 | document_id=document.id, 74 | text=text_chunk, 75 | source=document.source, 76 | metadata=document.metadata) 77 | for i, text_chunk in enumerate(text_chunks)] 78 | 79 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 80 | raise NotImplementedError() 81 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/models.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from canopy.models.data_models import Document, Query, SparseVector 7 | 8 | # TODO: (1) consider moving this to pinecone-text 9 | # TODO: (2) consider renaming to "Vector" or "DenseVector" 10 | # TODO: (3) consider supporting `np.ndarray` 11 | VectorValues = List[float] 12 | 13 | 14 | class KBDocChunk(Document): 15 | document_id: str 16 | 17 | 18 | class KBDocChunkWithScore(KBDocChunk): 19 | score: float 20 | 21 | 22 | class KBEncodedDocChunk(KBDocChunk): 23 | values: VectorValues 24 | sparse_values: Optional[SparseVector] = None 25 | 26 | def to_db_record(self): 27 | metadata = 
deepcopy(self.metadata) 28 | metadata["text"] = self.text 29 | metadata["document_id"] = self.document_id 30 | metadata["source"] = self.source 31 | 32 | record = { 33 | "id": self.id, 34 | "values": self.values, 35 | "metadata": metadata, 36 | 37 | } 38 | 39 | if self.sparse_values is not None and len(self.sparse_values["values"]) > 0: 40 | record["sparse_values"] = self.sparse_values 41 | 42 | return record 43 | 44 | 45 | class KBQuery(Query): 46 | values: Optional[VectorValues] = None 47 | sparse_values: Optional[SparseVector] = None 48 | 49 | 50 | class KBQueryResult(BaseModel): 51 | query: str 52 | documents: List[KBDocChunkWithScore] 53 | 54 | 55 | class DocumentWithScore(Document): 56 | score: float 57 | 58 | 59 | class QueryResult(BaseModel): 60 | query: str 61 | documents: List[DocumentWithScore] 62 | debug_info: dict = Field(default_factory=dict, exclude=True) 63 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/constants.py: -------------------------------------------------------------------------------- 1 | from canopy.knowledge_base.knowledge_base import INDEX_NAME_PREFIX 2 | 3 | COLLECTION_NAME_PREFIX = INDEX_NAME_PREFIX 4 | DENSE_VECTOR_NAME = "dense" 5 | RESERVED_METADATA_KEYS = {"document_id", "text", "source", "chunk_id"} 6 | SPARSE_VECTOR_NAME = "sparse" 7 | UUID_NAMESPACE = "867603e3-ba69-447d-a8ef-263dff19bda7" 8 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/converter.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Dict, List, Any, Union 3 | import uuid 4 | from canopy.knowledge_base.models import ( 5 | KBDocChunkWithScore, 6 | KBEncodedDocChunk, 7 | KBQuery, 8 | VectorValues, 9 | ) 10 | from pinecone_text.sparse import SparseVector 11 | 12 | try: 13 | from qdrant_client import models 14 | except ImportError: 15 | pass 16 | 17 | from canopy.knowledge_base.qdrant.constants import ( 18 | DENSE_VECTOR_NAME, 19 | SPARSE_VECTOR_NAME, 20 | UUID_NAMESPACE, 21 | ) 22 | 23 | 24 | class QdrantConverter: 25 | @staticmethod 26 | def convert_id(_id: str) -> str: 27 | """ 28 | Converts any string into a UUID string based on a seed. 29 | 30 | Qdrant accepts UUID strings and unsigned integers as point ID. 31 | We use a seed to convert each string into a UUID string deterministically. 32 | This allows us to overwrite the same point with the original ID. 
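        For example (illustrative only), convert_id("doc_1_0") always maps to the same UUID string, since it is computed as uuid.uuid5(uuid.UUID(UUID_NAMESPACE), "doc_1_0"); re-upserting a chunk with the same id therefore overwrites the same point.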
33 | """ 34 | return str(uuid.uuid5(uuid.UUID(UUID_NAMESPACE), _id)) 35 | 36 | @staticmethod 37 | def encoded_docs_to_points( 38 | encoded_docs: List[KBEncodedDocChunk], 39 | ) -> "List[models.PointStruct]": 40 | points = [] 41 | for doc in encoded_docs: 42 | record = doc.to_db_record() 43 | _id: str = record.pop("id") 44 | dense_vector: VectorValues = record.pop("values", None) 45 | sparse_vector: SparseVector = record.pop("sparse_values", None) 46 | 47 | vector: Dict[str, models.Vector] = {} 48 | 49 | if dense_vector: 50 | vector[DENSE_VECTOR_NAME] = dense_vector 51 | 52 | if sparse_vector: 53 | vector[SPARSE_VECTOR_NAME] = models.SparseVector( 54 | indices=sparse_vector["indices"], 55 | values=sparse_vector["values"], 56 | ) 57 | 58 | points.append( 59 | models.PointStruct( 60 | id=QdrantConverter.convert_id(_id), 61 | vector=vector, 62 | payload={**record["metadata"], "chunk_id": _id}, 63 | ) 64 | ) 65 | return points 66 | 67 | @staticmethod 68 | def scored_point_to_scored_doc( 69 | scored_point, 70 | ) -> "KBDocChunkWithScore": 71 | metadata: Dict[str, Any] = deepcopy(scored_point.payload or {}) 72 | _id = metadata.pop("chunk_id") 73 | text = metadata.pop("text", "") 74 | document_id = metadata.pop("document_id") 75 | return KBDocChunkWithScore( 76 | id=_id, 77 | text=text, 78 | document_id=document_id, 79 | score=scored_point.score, 80 | source=metadata.pop("source", ""), 81 | metadata=metadata, 82 | ) 83 | 84 | @staticmethod 85 | def kb_query_to_search_vector( 86 | query: KBQuery, 87 | ) -> "Union[models.NamedVector, models.NamedSparseVector]": 88 | # Use dense vector if available, otherwise use sparse vector 89 | query_vector: Union[models.NamedVector, models.NamedSparseVector] 90 | if query.values: 91 | query_vector = models.NamedVector(name=DENSE_VECTOR_NAME, vector=query.values) # noqa: E501 92 | elif query.sparse_values: 93 | query_vector = models.NamedSparseVector( 94 | name=SPARSE_VECTOR_NAME, 95 | vector=models.SparseVector( 96 | indices=query.sparse_values["indices"], 97 | values=query.sparse_values["values"], 98 | ), 99 | ) 100 | else: 101 | raise ValueError("Query should have either dense or sparse vector.") 102 | return query_vector 103 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | from itertools import islice 4 | from typing import Any, Callable, Optional 5 | 6 | import logging 7 | 8 | try: 9 | from qdrant_client import AsyncQdrantClient, QdrantClient 10 | from qdrant_client.local.async_qdrant_local import AsyncQdrantLocal 11 | except ImportError: 12 | pass 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def sync_fallback(method: Callable) -> Callable: 17 | @functools.wraps(method) 18 | async def wrapper(self, *args, **kwargs): 19 | if self._async_client is None or isinstance( 20 | self._async_client._client, AsyncQdrantLocal 21 | ): 22 | sync_method_name = method.__name__[1:] 23 | 24 | logger.warning( 25 | f"{method.__name__}() cannot be used for QdrantLocal. 
" 26 | f"Falling back to {sync_method_name}()" 27 | ) 28 | loop = asyncio.get_event_loop() 29 | 30 | call = functools.partial(getattr(self, sync_method_name), *args, **kwargs) 31 | return await loop.run_in_executor(None, call) 32 | else: 33 | return await method(self, *args, **kwargs) 34 | 35 | return wrapper 36 | 37 | 38 | def generate_clients( 39 | location: Optional[str] = None, 40 | url: Optional[str] = None, 41 | port: Optional[int] = 6333, 42 | grpc_port: int = 6334, 43 | prefer_grpc: bool = False, 44 | https: Optional[bool] = None, 45 | api_key: Optional[str] = None, 46 | prefix: Optional[str] = None, 47 | timeout: Optional[int] = None, 48 | host: Optional[str] = None, 49 | path: Optional[str] = None, 50 | force_disable_check_same_thread: bool = False, 51 | **kwargs: Any, 52 | ): 53 | sync_client = QdrantClient( 54 | location=location, 55 | url=url, 56 | port=port, 57 | grpc_port=grpc_port, 58 | prefer_grpc=prefer_grpc, 59 | https=https, 60 | api_key=api_key, 61 | prefix=prefix, 62 | timeout=timeout, 63 | host=host, 64 | path=path, 65 | force_disable_check_same_thread=force_disable_check_same_thread, 66 | **kwargs, 67 | ) 68 | 69 | if location == ":memory:" or path is not None: 70 | # In-memory Qdrant doesn't interoperate with Sync and Async clients 71 | # We fallback to sync operations in this case using @utils.sync_fallback 72 | async_client = None 73 | else: 74 | async_client = AsyncQdrantClient( 75 | location=location, 76 | url=url, 77 | port=port, 78 | grpc_port=grpc_port, 79 | prefer_grpc=prefer_grpc, 80 | https=https, 81 | api_key=api_key, 82 | prefix=prefix, 83 | timeout=timeout, 84 | host=host, 85 | path=path, 86 | force_disable_check_same_thread=force_disable_check_same_thread, 87 | **kwargs, 88 | ) 89 | 90 | return sync_client, async_client 91 | 92 | 93 | def batched(iterable, n): 94 | """ 95 | Batch elements of an iterable into fixed-length chunks or blocks. 96 | Based on itertools.batched() from Python 3.12 97 | """ 98 | it = iter(iterable) 99 | while batch := tuple(islice(it, n)): 100 | yield batch 101 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RecordEncoder 2 | from .cohere import CohereRecordEncoder 3 | from .dense import DenseRecordEncoder 4 | from .openai import OpenAIRecordEncoder 5 | from .anyscale import AnyscaleRecordEncoder 6 | from .azure_openai import AzureOpenAIRecordEncoder 7 | from .jina import JinaRecordEncoder 8 | from .sentence_transformers import SentenceTransformerRecordEncoder 9 | from .hybrid import HybridRecordEncoder 10 | from .octoai import OctoAIRecordEncoder 11 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/anyscale.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 4 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 5 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 6 | from canopy.models.data_models import Query 7 | 8 | ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1" 9 | 10 | 11 | class AnyscaleRecordEncoder(DenseRecordEncoder): 12 | """ 13 | AnyscaleRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 
14 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 15 | For more information about see: https://github.com/pinecone-io/pinecone-text 16 | 17 | """ # noqa: E501 18 | """ 19 | Initialize the AnyscaleRecordEncoder 20 | 21 | Args: 22 | api_key: The Anyscale Endpoint API Key 23 | base_url: The Base URL for Anyscale Endpoint 24 | model_name: The name of the Anyscale embeddings model to use for encoding. See https://docs.endpoints.anyscale.com/category/supported-models 25 | batch_size: The number of documents or queries to encode at once. 26 | Defaults to 400. 27 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 28 | """ # noqa: E501 29 | def __init__(self, 30 | *, 31 | api_key: str = "", 32 | base_url: str = ANYSCALE_BASE_URL, 33 | model_name: str = "thenlper/gte-large", 34 | batch_size: int = 400, 35 | **kwargs): 36 | 37 | ae_api_key = api_key or os.environ.get("ANYSCALE_API_KEY") 38 | if not ae_api_key: 39 | raise ValueError( 40 | "Anyscale API key is required to use Anyscale. " 41 | "Please provide it as an argument " 42 | "or set the ANYSCALE_API_KEY environment variable." 43 | ) 44 | ae_base_url = base_url 45 | encoder = OpenAIEncoder(model_name, 46 | base_url=ae_base_url, api_key=ae_api_key, 47 | **kwargs) 48 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 49 | 50 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 51 | """ 52 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 53 | 54 | Args: 55 | documents: A list of KBDocChunk to encode. 56 | 57 | Returns: 58 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 59 | """ # noqa: E501 60 | return super().encode_documents(documents) 61 | 62 | async def _aencode_documents_batch(self, 63 | documents: List[KBDocChunk] 64 | ) -> List[KBEncodedDocChunk]: 65 | raise NotImplementedError 66 | 67 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/azure_openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pinecone_text.dense import AzureOpenAIEncoder 4 | 5 | from canopy.knowledge_base.record_encoder import OpenAIRecordEncoder, DenseRecordEncoder 6 | import openai 7 | 8 | 9 | class AzureOpenAIRecordEncoder(OpenAIRecordEncoder): 10 | """ 11 | AzureOpenAIRecordEncoder is a type of DenseRecordEncoder that uses the Azure OpenAI's `embeddings` deployments. 12 | The implementation uses the `AzureOpenAIEncoder` class from the `pinecone-text` library. 13 | For more information about see: https://github.com/pinecone-io/pinecone-text 14 | 15 | Azure OpenAI services require a valid API key, and an Azure endpoint. You will need 16 | To set the following environment variables: 17 | - AZURE_OPENAI_API_KEY: Your Azure OpenAI API key. 18 | - AZURE_OPENAI_ENDPOINT: Your Azure endpoint, including the resource, e.g. 
`https://example-resource.azure.openai.com/` 19 | """ # noqa: E501 20 | 21 | def __init__( 22 | self, 23 | *, 24 | model_name: str, 25 | api_version: str = "2023-12-01-preview", 26 | batch_size: int = 400, 27 | **kwargs 28 | ): 29 | """ 30 | Initialize the AzureOpenAIRecordEncoder 31 | 32 | Args: 33 | model_name: The name of the embeddings model deployment to use for encoding 34 | api_version: The Azure OpenAI API version to use. Defaults to "2023-12-01-preview". 35 | batch_size: The number of documents or queries to encode at once. 36 | Defaults to 400. 37 | **kwargs: Additional arguments to pass to the underlying `pinecone-text.AzureOpenAIEncoder`. 38 | """ # noqa: E501 39 | try: 40 | encoder = AzureOpenAIEncoder(model_name, api_version=api_version, **kwargs) 41 | except (openai.OpenAIError, ValueError) as e: 42 | raise RuntimeError( 43 | "Failed to connect to Azure OpenAI, please make sure that the " 44 | "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment variables " 45 | "are set correctly. " 46 | f"Underlying Error:\n{self._format_openai_error(e)}" 47 | ) from e 48 | 49 | DenseRecordEncoder.__init__(self, dense_encoder=encoder, batch_size=batch_size, 50 | **kwargs) 51 | 52 | def _format_error(self, err): 53 | if isinstance(err, openai.AuthenticationError): 54 | return ( 55 | "Failed to connect to Azure OpenAI, please make sure that the " 56 | "AZURE_OPENAI_API_KEY environment variable is set correctly. " 57 | f"Underlying Error:\n{self._format_openai_error(err)}" 58 | ) 59 | elif isinstance(err, openai.APIConnectionError): 60 | return ( 61 | f"Failed to connect to your Azure OpenAI endpoint, please make sure " 62 | f"that the provided endpoint {os.getenv('AZURE_OPENAI_ENDPOINT')} " 63 | f"is correct. Underlying Error:\n{self._format_openai_error(err)}" 64 | ) 65 | elif isinstance(err, openai.NotFoundError): 66 | return ( 67 | f"Failed to connect to your Azure OpenAI. Please make sure that " 68 | f"you have provided the correct deployment name: {self.model_name} " 69 | f"and API version: {self._client._api_version}. " 70 | f"Underlying Error:\n{self._format_openai_error(err)}" 71 | ) 72 | else: 73 | return super()._format_error(err) 74 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/cohere.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from pinecone_text.dense.cohere_encoder import CohereEncoder 3 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 4 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | class CohereRecordEncoder(DenseRecordEncoder): 9 | """ 10 | CohereRecordEncoder is a type of DenseRecordEncoder that uses the Cohere `embed` API. 11 | The implementation uses the `CohereEncoder` class from the `pinecone-text` library. 12 | For more information about see: https://github.com/pinecone-io/pinecone-text 13 | 14 | """ # noqa: E501 15 | 16 | def __init__( 17 | self, 18 | *, 19 | model_name: str = "embed-english-v3.0", 20 | batch_size: int = 100, 21 | **kwargs, 22 | ): 23 | """ 24 | Initialize the CohereRecordEncoder 25 | 26 | Args: 27 | model_name: The name of the Cohere embeddings model to use for encoding. See https://docs.cohere.com/reference/embed 28 | batch_size: The number of documents or queries to encode at once. 29 | Defaults to 100. 
30 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. CohereEncoder`. 31 | """ # noqa: E501 32 | encoder = CohereEncoder(model_name, **kwargs) 33 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 34 | 35 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 36 | """ 37 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 38 | 39 | Args: 40 | documents: A list of KBDocChunk to encode. 41 | 42 | Returns: 43 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 44 | """ # noqa: E501 45 | return super().encode_documents(documents) 46 | 47 | async def _aencode_documents_batch( 48 | self, documents: List[KBDocChunk] 49 | ) -> List[KBEncodedDocChunk]: 50 | raise NotImplementedError 51 | 52 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 53 | raise NotImplementedError 54 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/dense.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from typing import List 3 | from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder 4 | 5 | from .base import RecordEncoder 6 | from canopy.knowledge_base.models import KBQuery, KBEncodedDocChunk, KBDocChunk 7 | from canopy.models.data_models import Query 8 | 9 | 10 | class DenseRecordEncoder(RecordEncoder): 11 | """ 12 | DenseRecordEncoder is a subclass of RecordEncoder that generates dense vector representation of documents chunks and textual queries. 13 | The dense representation generated by the `DenseRecordEncoder` is a list of floats in a given dimension. 14 | DenseRecordEncoder wraps a BaseDenseEncoder from the `pinecone-text` library to encode the text itself. 15 | for more information about the BaseDenseEncoder see: https://github.com/pinecone-io/pinecone-text 16 | """ # noqa: E501 17 | 18 | def __init__(self, 19 | dense_encoder: BaseDenseEncoder, 20 | **kwargs): 21 | """ 22 | Initialize the encoder. 23 | 24 | Args: 25 | dense_encoder: A BaseDenseEncoder to encode the text. 26 | **kwargs: Additional arguments to pass to the RecordEncoder. 27 | """ # noqa: E501 28 | super().__init__(**kwargs) 29 | self._dense_encoder = dense_encoder 30 | 31 | def _encode_documents_batch(self, 32 | documents: List[KBDocChunk] 33 | ) -> List[KBEncodedDocChunk]: 34 | """ 35 | Encode a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 36 | 37 | Args: 38 | documents: A list of KBDocChunk to encode. 39 | Returns: 40 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 41 | """ # noqa: E501 42 | dense_values = self._dense_encoder.encode_documents([d.text for d in documents]) 43 | return [KBEncodedDocChunk(**d.model_dump(), values=v) for d, v in 44 | zip(documents, dense_values)] 45 | 46 | def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 47 | """ 48 | Encode a batch of queries, takes a list of Query and returns a list of KBQuery. 49 | Args: 50 | queries: A list of Query to encode. 51 | Returns: 52 | encoded queries: A list of KBQuery, with the `values` field populated by the generated embeddings vector. 
53 | """ # noqa: E501 54 | dense_values = self._dense_encoder.encode_queries([q.text for q in queries]) 55 | return [ 56 | KBQuery(**q.model_dump(), values=v) for q, v in zip(queries, dense_values) 57 | ] 58 | 59 | @cached_property 60 | def dimension(self) -> int: 61 | """ 62 | The dimension is the length of the vector generated by the `DenseRecordEncoder` 63 | 64 | Returns: 65 | dimension(int): the dimension of the encoder 66 | """ # noqa: E501 67 | return self._dense_encoder.dimension 68 | 69 | async def _aencode_documents_batch(self, 70 | documents: List[KBDocChunk] 71 | ) -> List[KBEncodedDocChunk]: 72 | raise NotImplementedError 73 | 74 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/jina.py: -------------------------------------------------------------------------------- 1 | from pinecone_text.dense import JinaEncoder 2 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 3 | 4 | 5 | class JinaRecordEncoder(DenseRecordEncoder): 6 | """ 7 | JinaRecordEncoder is a type of DenseRecordEncoder that uses the JinaAI `embeddings` API. 8 | The implementation uses the `JinaEncoder` class from the `pinecone-text` library. 9 | For more information about see: https://github.com/pinecone-io/pinecone-text 10 | 11 | """ # noqa: E501 12 | 13 | def __init__(self, 14 | *, 15 | model_name: str = "jina-embeddings-v2-base-en", 16 | batch_size: int = 400, 17 | **kwargs): 18 | """ 19 | Initialize the JinaRecordEncoder 20 | 21 | Args: 22 | model_name: The name of the embedding model to use. 23 | batch_size: The number of documents or queries to encode at once. 24 | Defaults to 400. 25 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. JinaEncoder`. 26 | """ # noqa: E501 27 | encoder = JinaEncoder(model_name, **kwargs) 28 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 29 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/octoai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 4 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 5 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 6 | from canopy.models.data_models import Query 7 | 8 | OCTOAI_BASE_URL = "https://text.octoai.run/v1" 9 | 10 | 11 | class OctoAIRecordEncoder(DenseRecordEncoder): 12 | """ 13 | OctoAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 14 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 15 | For more information about see: https://github.com/pinecone-io/pinecone-text 16 | 17 | """ # noqa: E501 18 | """ 19 | Initialize the OctoAIRecordEncoder 20 | 21 | Args: 22 | api_key: The OctoAI Endpoint API Key 23 | base_url: The Base URL for the OctoAI Endpoint 24 | model_name: The name of the OctoAI embeddings model to use for encoding. See https://octo.ai/docs/text-gen-solution/getting-started 25 | batch_size: The number of documents or queries to encode at once. 26 | Defaults to 1. 27 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 
28 | """ # noqa: E501 29 | def __init__(self, 30 | *, 31 | api_key: str = "", 32 | base_url: str = OCTOAI_BASE_URL, 33 | model_name: str = "thenlper/gte-large", 34 | batch_size: int = 1024, 35 | **kwargs): 36 | 37 | octoai_api_key = api_key or os.environ.get("OCTOAI_API_KEY") 38 | if not octoai_api_key: 39 | raise ValueError( 40 | "An OctoAI API token is required to use OctoAI. " 41 | "Please provide it as an argument " 42 | "or set the OCTOAI_API_KEY environment variable." 43 | ) 44 | octoai_base_url = base_url 45 | encoder = OpenAIEncoder(model_name, 46 | base_url=octoai_base_url, api_key=octoai_api_key, 47 | **kwargs) 48 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 49 | 50 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 51 | """ 52 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 53 | 54 | Args: 55 | documents: A list of KBDocChunk to encode. 56 | 57 | Returns: 58 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 59 | """ # noqa: E501 60 | return super().encode_documents(documents) 61 | 62 | async def _aencode_documents_batch(self, 63 | documents: List[KBDocChunk] 64 | ) -> List[KBEncodedDocChunk]: 65 | raise NotImplementedError 66 | 67 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError 4 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 5 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 6 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 7 | from canopy.models.data_models import Query 8 | 9 | 10 | class OpenAIRecordEncoder(DenseRecordEncoder): 11 | """ 12 | OpenAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 13 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 14 | For more information about see: https://github.com/pinecone-io/pinecone-text 15 | 16 | """ # noqa: E501 17 | 18 | def __init__( 19 | self, 20 | *, 21 | model_name: str = "text-embedding-3-small", 22 | batch_size: int = 400, 23 | dimension: Optional[int] = None, 24 | **kwargs 25 | ): 26 | """ 27 | Initialize the OpenAIRecordEncoder 28 | 29 | Args: 30 | model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings 31 | batch_size: The number of documents or queries to encode at once. 32 | Defaults to 400. 33 | dimension: The dimension of the embeddings vector to generate. 34 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 
35 | """ # noqa: E501 36 | try: 37 | encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs) 38 | except OpenAIError as e: 39 | raise RuntimeError( 40 | "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY " 41 | "environment variable is set correctly.\n" 42 | f"Error: {self._format_openai_error(e)}" 43 | ) from e 44 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 45 | 46 | async def _aencode_documents_batch(self, 47 | documents: List[KBDocChunk] 48 | ) -> List[KBEncodedDocChunk]: 49 | raise NotImplementedError 50 | 51 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 52 | raise NotImplementedError 53 | 54 | @staticmethod 55 | def _format_openai_error(e): 56 | try: 57 | response = e.response.json() 58 | if "error" in response: 59 | return response["error"]["message"] 60 | elif "message" in response: 61 | return response["message"] 62 | else: 63 | return str(e) 64 | except Exception: 65 | return str(e) 66 | 67 | def _format_error(self, err): 68 | if isinstance(err, RateLimitError): 69 | return (f"Your OpenAI account seem to have reached the rate limit. " 70 | f"Details: {self._format_openai_error(err)}") 71 | elif isinstance(err, (AuthenticationError, APIConnectionError)): 72 | return (f"Failed to connect to OpenAI, please make sure that the " 73 | f"OPENAI_API_KEY environment variable is set correctly. " 74 | f"Details: {self._format_openai_error(err)}") 75 | else: 76 | return self._format_openai_error(err) 77 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/sentence_transformers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from pinecone_text.dense import SentenceTransformerEncoder 3 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 4 | from huggingface_hub.utils import RepositoryNotFoundError 5 | 6 | 7 | class SentenceTransformerRecordEncoder(DenseRecordEncoder): 8 | """ 9 | SentenceTransformerRecordEncoder is a type of DenseRecordEncoder that uses a Sentence Transformer model. 10 | The implementation uses the `SentenceTransformerEncoder` class from the `pinecone-text` library. 11 | For more information about see: https://github.com/pinecone-io/pinecone-text 12 | 13 | """ # noqa: E501 14 | 15 | def __init__(self, 16 | *, 17 | model_name: str = "sentence-transformers/all-MiniLM-L6-v2", 18 | query_encoder_name: Optional[str] = None, 19 | batch_size: int = 400, 20 | device: Optional[str] = None, 21 | **kwargs) -> None: 22 | """ 23 | Initialize the SentenceTransformerRecordEncoder 24 | 25 | Args: 26 | model_name: The name of the embedding model to use for encoding documents. 27 | See https://huggingface.co/models?library=sentence-transformers 28 | for all possible Sentence Transformer models. 29 | query_encoder_name: The name of the embedding model to use for encoding queries. 30 | See https://huggingface.co/models?library=sentence-transformers 31 | for all possible Sentence Transformer models. 32 | Defaults to `model_name`. 33 | batch_size: The number of documents or queries to encode at once. 34 | Defaults to 400. 35 | device: The local device to use for encoding, for example "cpu", "cuda" or "mps". 36 | Defaults to "cuda" if cuda is available, otherwise to "cpu". 37 | **kwargs: Additional arguments to pass to the underlying `pinecone-text.SentenceTransformerEncoder`. 
38 | """ # noqa: E501 39 | try: 40 | encoder = SentenceTransformerEncoder( 41 | document_encoder_name=model_name, 42 | query_encoder_name=query_encoder_name, 43 | device=device, 44 | **kwargs, 45 | ) 46 | except RepositoryNotFoundError as e: 47 | raise RuntimeError( 48 | "Your chosen Sentence Transformer model(s) could not be found. " 49 | f"Details: {str(e)}" 50 | ) from e 51 | except ImportError: 52 | raise ImportError( 53 | f"{self.__class__.__name__} requires the `torch` and `transformers` " 54 | f"extra dependencies. Please install them using " 55 | f"`pip install canopy-sdk[torch,transformers]`." 56 | ) 57 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 58 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | from .reranker import Reranker 2 | from .transparent import TransparentReranker 3 | from .cohere import CohereReranker 4 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/cohere.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional 3 | 4 | 5 | from canopy.knowledge_base.models import KBQueryResult 6 | from canopy.knowledge_base.reranker import Reranker 7 | 8 | try: 9 | import cohere 10 | from cohere import CohereAPIError 11 | except (OSError, ImportError, ModuleNotFoundError): 12 | _cohere_installed = False 13 | else: 14 | _cohere_installed = True 15 | 16 | 17 | class CohereReranker(Reranker): 18 | """ 19 | Reranker that uses Cohere's text embedding to rerank documents. 20 | 21 | For each query and documents returned for that query, returns a list 22 | of documents ordered by their relevance to the provided query. 23 | """ 24 | 25 | def __init__(self, 26 | model_name: str = 'rerank-english-v2.0', 27 | *, 28 | top_n: int = 10, 29 | api_key: Optional[str] = None): 30 | """ 31 | Initializes the Cohere reranker. 32 | 33 | Args: 34 | model_name: The identifier of the model to use, one of : 35 | rerank-english-v2.0, rerank-multilingual-v2.0 36 | top_n: The number of most relevant documents return, defaults to 10 37 | api_key: API key for Cohere. If not passed `CO_API_KEY` environment 38 | variable will be used. 39 | """ 40 | 41 | if not _cohere_installed: 42 | raise ImportError( 43 | "Failed to import cohere. Make sure you install cohere extra " 44 | "dependencies by running: " 45 | "pip install canopy-sdk[cohere]" 46 | ) 47 | cohere_api_key = api_key or os.environ.get("CO_API_KEY") 48 | if cohere_api_key is None: 49 | raise RuntimeError( 50 | "Cohere API key is required to use Cohere Reranker. " 51 | "Please provide it as an argument " 52 | "or set the CO_API_KEY environment variable." 53 | ) 54 | self._client = cohere.Client(api_key=cohere_api_key) 55 | self._model_name = model_name 56 | self._top_n = top_n 57 | 58 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 59 | reranked_query_results: List[KBQueryResult] = [] 60 | for result in results: 61 | texts = [doc.text for doc in result.documents] 62 | try: 63 | response = self._client.rerank(query=result.query, 64 | documents=texts, 65 | top_n=self._top_n, 66 | model=self._model_name) 67 | except CohereAPIError as e: 68 | raise RuntimeError("Failed to rerank documents using Cohere." 
69 | f" Underlying Error:\n{e.message}") 70 | 71 | reranked_docs = [] 72 | for rerank_result in response: 73 | doc = result.documents[rerank_result.index].model_copy( 74 | deep=True, 75 | update=dict(score=rerank_result.relevance_score) 76 | ) 77 | reranked_docs.append(doc) 78 | 79 | reranked_query_results.append(KBQueryResult(query=result.query, 80 | documents=reranked_docs)) 81 | return reranked_query_results 82 | 83 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 84 | raise NotImplementedError() 85 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/reranker.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import KBQueryResult 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class Reranker(ABC, ConfigurableMixin): 9 | """ 10 | Abstract class for rerankers. Rerankers take a list of KBQueryResult and return a list of KBQueryResult, 11 | where the results are reranked according to the reranker logic. 12 | Reranker is an abstract class that must be subclassed to be used, 13 | """ # noqa: E501 14 | 15 | @abstractmethod 16 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 17 | pass 18 | 19 | @abstractmethod 20 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 21 | pass 22 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/transparent.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.knowledge_base.models import KBQueryResult 4 | from canopy.knowledge_base.reranker import Reranker 5 | 6 | 7 | class TransparentReranker(Reranker): 8 | """ 9 | Transparent reranker that does nothing, it just returns the results as is. This is the default reranker. 10 | The TransparentReranker is used as a placeholder for future development "forcing" every result set to be reranked. 11 | """ # noqa: E501 12 | 13 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 14 | """ 15 | Returns the results as is. 16 | 17 | Args: 18 | results: A list of KBQueryResult to rerank. 19 | 20 | Returns: 21 | results: A list of KBQueryResult, same as the input. 
22 | """ # noqa: E501 23 | return results 24 | 25 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 26 | return results 27 | -------------------------------------------------------------------------------- /src/canopy/llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLLM 2 | from .openai import OpenAILLM 3 | from .anyscale import AnyscaleLLM 4 | from .azure_openai_llm import AzureOpenAILLM 5 | from .cohere import CohereLLM 6 | from .octoai import OctoAILLM 7 | -------------------------------------------------------------------------------- /src/canopy/llm/anyscale.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | import os 3 | from canopy.llm import OpenAILLM 4 | from canopy.llm.models import Function 5 | from canopy.models.data_models import Messages 6 | 7 | ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1" 8 | FUNCTION_MODEL_LIST = [ 9 | "mistralai/Mistral-7B-Instruct-v0.1", 10 | "mistralai/Mixtral-8x7B-Instruct-v0.1", 11 | ] 12 | 13 | 14 | class AnyscaleLLM(OpenAILLM): 15 | """ 16 | Anyscale LLM wrapper built on top of the OpenAI Python client. 17 | 18 | Note: Anyscale requires a valid API key to use this class. 19 | You can set the "ANYSCALE_API_KEY" environment variable. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | model_name: str = "meta-llama/Llama-2-7b-chat-hf", 25 | *, 26 | base_url: Optional[str] = ANYSCALE_BASE_URL, 27 | api_key: Optional[str] = None, 28 | **kwargs: Any, 29 | ): 30 | ae_api_key = api_key or os.environ.get("ANYSCALE_API_KEY") 31 | if not ae_api_key: 32 | raise ValueError( 33 | "Anyscale API key is required to use Anyscale. " 34 | "Please provide it as an argument " 35 | "or set the ANYSCALE_API_KEY environment variable." 36 | ) 37 | ae_base_url = base_url 38 | super().__init__(model_name, api_key=ae_api_key, base_url=ae_base_url, **kwargs) 39 | 40 | def enforced_function_call( 41 | self, 42 | system_prompt: str, 43 | chat_history: Messages, 44 | function: Function, 45 | *, 46 | max_tokens: Optional[int] = None, 47 | model_params: Optional[dict] = None, 48 | ) -> dict: 49 | model = self.model_name 50 | if model_params and "model" in model_params: 51 | model = model_params["model"] 52 | if model not in FUNCTION_MODEL_LIST: 53 | raise NotImplementedError( 54 | f"Model {model} doesn't support function calling. 
" 55 | "To use function calling capability, please select a different model.\n" 56 | "Pleaes check following link for details: " 57 | "https://docs.endpoints.anyscale.com/guides/function-calling" 58 | ) 59 | else: 60 | return super().enforced_function_call( 61 | system_prompt, chat_history, function, 62 | max_tokens=max_tokens, model_params=model_params 63 | ) 64 | 65 | def aenforced_function_call(self, 66 | system_prompt: str, 67 | chat_history: Messages, 68 | function: Function, 69 | *, 70 | max_tokens: Optional[int] = None, 71 | model_params: Optional[dict] = None 72 | ): 73 | raise NotImplementedError() 74 | -------------------------------------------------------------------------------- /src/canopy/llm/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Iterable, Optional 3 | 4 | from canopy.llm.models import Function 5 | from canopy.models.api_models import ChatResponse, StreamingChatChunk 6 | from canopy.models.data_models import Messages, Context 7 | from canopy.utils.config import ConfigurableMixin 8 | 9 | 10 | class BaseLLM(ABC, ConfigurableMixin): 11 | def __init__(self, 12 | model_name: str): 13 | self.model_name = model_name 14 | 15 | @abstractmethod 16 | def chat_completion(self, 17 | system_prompt: str, 18 | chat_history: Messages, 19 | context: Optional[Context] = None, 20 | *, 21 | stream: bool = False, 22 | max_tokens: Optional[int] = None, 23 | model_params: Optional[dict] = None, 24 | ) -> Union[ChatResponse, Iterable[StreamingChatChunk]]: 25 | pass 26 | 27 | @abstractmethod 28 | def enforced_function_call(self, 29 | system_prompt: str, 30 | chat_history: Messages, 31 | function: Function, 32 | *, 33 | max_tokens: Optional[int] = None, 34 | model_params: Optional[dict] = None, 35 | ) -> dict: 36 | pass 37 | 38 | @abstractmethod 39 | async def achat_completion(self, 40 | system_prompt: str, 41 | chat_history: Messages, 42 | context: Optional[Context] = None, 43 | *, 44 | stream: bool = False, 45 | max_generated_tokens: Optional[int] = None, 46 | model_params: Optional[dict] = None, 47 | ) -> Union[ChatResponse, 48 | Iterable[StreamingChatChunk]]: 49 | pass 50 | 51 | @abstractmethod 52 | async def aenforced_function_call(self, 53 | system_prompt: str, 54 | chat_history: Messages, 55 | function: Function, 56 | *, 57 | max_tokens: Optional[int] = None, 58 | model_params: Optional[dict] = None 59 | ): 60 | pass 61 | -------------------------------------------------------------------------------- /src/canopy/llm/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | 3 | from pydantic import BaseModel, model_serializer 4 | 5 | 6 | class FunctionPrimitiveProperty(BaseModel): 7 | name: str 8 | type: str 9 | description: Optional[str] = None 10 | enum: Optional[List[str]] = None 11 | 12 | 13 | class FunctionArrayProperty(BaseModel): 14 | name: str 15 | items_type: str 16 | # we require description for array properties 17 | # because the model is more struggling with them 18 | description: str 19 | 20 | def model_dump(self, *args, **kwargs): 21 | super_dict = super().model_dump(*args, **kwargs) 22 | if "items_type" in super_dict: 23 | super_dict["type"] = "array" 24 | super_dict["items"] = {"type": super_dict.pop("items_type")} 25 | return super_dict 26 | 27 | 28 | FunctionProperty = Union[FunctionPrimitiveProperty, FunctionArrayProperty] 29 | 30 | 31 | class FunctionParameters(BaseModel): 
32 | required_properties: List[FunctionProperty] 33 | optional_properties: List[FunctionProperty] = [] 34 | 35 | @model_serializer() 36 | def serialize_model(self): 37 | return { 38 | "type": "object", 39 | "properties": { 40 | pro.name: pro.model_dump(exclude_none=True, exclude={"name"}) 41 | for pro in self.required_properties + self.optional_properties 42 | }, 43 | "required": [pro.name for pro in self.required_properties], 44 | } 45 | 46 | 47 | class Function(BaseModel): 48 | name: str 49 | description: str 50 | parameters: FunctionParameters 51 | -------------------------------------------------------------------------------- /src/canopy/llm/octoai.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | import os 3 | from canopy.llm import OpenAILLM 4 | from canopy.llm.models import Function 5 | from canopy.models.data_models import Messages 6 | 7 | OCTOAI_BASE_URL = "https://text.octoai.run/v1" 8 | 9 | 10 | class OctoAILLM(OpenAILLM): 11 | """ 12 | OctoAI LLM wrapper built on top of the OpenAI Python client. 13 | 14 | Note: OctoAI requires a valid API key to use this class. 15 | You can set the "OCTOAI_API_KEY" environment variable. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | model_name: str = "mistral-7b-instruct-fp16", 21 | *, 22 | base_url: Optional[str] = OCTOAI_BASE_URL, 23 | api_key: Optional[str] = None, 24 | **kwargs: Any, 25 | ): 26 | octoai_api_key = api_key or os.environ.get("OCTOAI_API_KEY") 27 | if not octoai_api_key: 28 | raise ValueError( 29 | "OctoAI API key is required to use OctoAI. " 30 | "If you haven't done it, please sign up at https://octo.ai \n" 31 | "The key can be provided as an argument or " 32 | "via the OCTOAI_API_KEY environment variable." 33 | ) 34 | octoai_base_url = base_url 35 | super().__init__( 36 | model_name, 37 | api_key=octoai_api_key, 38 | base_url=octoai_base_url, 39 | **kwargs 40 | ) 41 | 42 | def enforced_function_call( 43 | self, 44 | system_prompt: str, 45 | chat_history: Messages, 46 | function: Function, 47 | *, 48 | max_tokens: Optional[int] = None, 49 | model_params: Optional[dict] = None, 50 | ) -> dict: 51 | raise NotImplementedError("OctoAI doesn't support function calling.") 52 | 53 | def aenforced_function_call(self, 54 | system_prompt: str, 55 | chat_history: Messages, 56 | function: Function, 57 | *, 58 | max_tokens: Optional[int] = None, 59 | model_params: Optional[dict] = None 60 | ): 61 | raise NotImplementedError("OctoAI doesn't support function calling.") 62 | -------------------------------------------------------------------------------- /src/canopy/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy/models/__init__.py -------------------------------------------------------------------------------- /src/canopy/models/api_models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Iterable 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | from canopy.models.data_models import MessageBase 6 | 7 | 8 | class _Choice(BaseModel): 9 | index: int 10 | message: MessageBase 11 | finish_reason: Optional[str] = None 12 | 13 | 14 | class _StreamChoice(BaseModel): 15 | index: int 16 | delta: dict 17 | finish_reason: Optional[str] = None 18 | 19 | 20 | class TokenCounts(BaseModel): 21 | prompt_tokens: int 22 | completion_tokens: int 
23 | total_tokens: int 24 | 25 | 26 | class ChatResponse(BaseModel): 27 | id: str = Field(description="Canopy session Id.") 28 | object: str 29 | created: int 30 | model: str 31 | choices: Sequence[_Choice] 32 | usage: TokenCounts 33 | debug_info: dict = Field(default_factory=dict, exclude=True) 34 | 35 | 36 | class StreamingChatChunk(BaseModel): 37 | id: str 38 | object: str 39 | created: int 40 | model: str 41 | choices: Sequence[_StreamChoice] 42 | 43 | 44 | class StreamingChatResponse(BaseModel): 45 | chunks: Iterable[StreamingChatChunk] 46 | debug_info: dict = Field(default_factory=dict, exclude=True) 47 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .cohere import CohereAPITokenizer, CohereHFTokenizer 2 | from .llama import LlamaTokenizer 3 | from .openai import OpenAITokenizer 4 | from .tokenizer import Tokenizer 5 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.models.data_models import Messages 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class BaseTokenizer(ABC, ConfigurableMixin): 9 | 10 | @abstractmethod 11 | def tokenize(self, text: str) -> List[str]: 12 | pass 13 | 14 | @abstractmethod 15 | def detokenize(self, tokens: List[str]) -> str: 16 | pass 17 | 18 | def token_count(self, text: str) -> int: 19 | return len(self.tokenize(text)) 20 | 21 | @abstractmethod 22 | def messages_token_count(self, messages: Messages) -> int: 23 | pass 24 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/openai.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | from typing import List 3 | from .base import BaseTokenizer 4 | from ..models.data_models import Messages 5 | 6 | 7 | class OpenAITokenizer(BaseTokenizer): 8 | """ 9 | Tokenizer for OpenAI models, based on the tiktoken library. 10 | 11 | Usage: 12 | Initialize the singleton tokenizer with the OpenAITokenizer class: 13 | >>> from canopy.tokenizer import Tokenizer 14 | >>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo") 15 | 16 | You can then use the tokenizer instance from anywhere in the code: 17 | >>> from canopy.tokenizer import Tokenizer 18 | >>> tokenizer = Tokenizer() 19 | >>> tokenizer.tokenize("Hello world!") 20 | ['Hello', ' world', '!'] 21 | """ # noqa: E501 22 | 23 | MESSAGE_TOKENS_OVERHEAD = 3 24 | FIXED_PREFIX_TOKENS = 3 25 | 26 | def __init__(self, model_name: str = "gpt-3.5-turbo"): 27 | """ 28 | Initialize the tokenizer. 29 | 30 | Args: 31 | model_name: The name of the model to use. Defaults to "gpt-3.5-turbo". 32 | You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18 33 | As you can see, both gpt-3.5 and gpt-4 are using the same cl100k_base tokenizer. 34 | """ # noqa: E501 35 | self._encoder = tiktoken.encoding_for_model(model_name) 36 | 37 | def tokenize(self, text: str) -> List[str]: 38 | """ 39 | Tokenize a text using tiktoken. 40 | 41 | Args: 42 | text: The text to tokenize. 43 | 44 | Returns: 45 | The list of tokens. 
46 | """ 47 | return [self._encoder.decode([encoded_token]) 48 | for encoded_token in self._encode(text)] 49 | 50 | def detokenize(self, tokens: List[str]) -> str: 51 | """ 52 | Detokenize a list of tokens that were previously tokenized using this tokenizer. 53 | 54 | Args: 55 | tokens: The list of tokens to detokenize. 56 | 57 | Returns: 58 | The detokenized text as a string. 59 | """ 60 | if not isinstance(tokens, List): 61 | raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") 62 | return "".join(tokens) 63 | 64 | def token_count(self, text: str) -> int: 65 | """ 66 | Count the number of tokens in a text. 67 | 68 | Args: 69 | text: The text to count the tokens of. 70 | 71 | Returns: 72 | The number of tokens in the text. 73 | """ 74 | return len(self._encode(text)) 75 | 76 | def _encode(self, text): 77 | return self._encoder.encode(text, disallowed_special=()) 78 | 79 | def messages_token_count(self, messages: Messages) -> int: 80 | """ 81 | Count the number of tokens in a list of messages as expected to be counted by OpenAI models. 82 | Account for the overhead of the messages structure. 83 | Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb 84 | 85 | Args: 86 | messages: The list of messages to count the tokens of. 87 | 88 | Returns: 89 | The number of tokens in the messages, as expected to be counted by OpenAI models. 90 | """ # noqa: E501 91 | num_tokens = 0 92 | for message in messages: 93 | num_tokens += self.MESSAGE_TOKENS_OVERHEAD 94 | for key, value in message.model_dump().items(): 95 | num_tokens += self.token_count(value) 96 | num_tokens += self.FIXED_PREFIX_TOKENS 97 | return num_tokens 98 | -------------------------------------------------------------------------------- /src/canopy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy/utils/__init__.py -------------------------------------------------------------------------------- /src/canopy/utils/debugging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | CANOPY_DEBUG_INFO = os.getenv("CANOPY_DEBUG_INFO", "FALSE").lower() == "true" 4 | -------------------------------------------------------------------------------- /src/canopy/utils/directory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | class Directory: 5 | """Stores the directory paths for Canopy library""" 6 | 7 | ROOT = Path(__file__).parent.parent 8 | CONFIG_TEMPLATES = ROOT.joinpath("config_templates") 9 | -------------------------------------------------------------------------------- /src/canopy_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy_cli/__init__.py -------------------------------------------------------------------------------- /src/canopy_cli/cli_spinner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | import itertools 4 | 5 | 6 | class Spinner(object): 7 | 8 | def __init__(self, disable=False, force=False, stream=sys.stdout, cycle=None): 9 | _cycle = cycle or ['-', '/', '|', '\\'] 10 | self.spinner_cycle = itertools.cycle(_cycle) 11 | self.disable = disable 12 | self.force = 
force 13 | self.stream = stream 14 | self.stop_running = None 15 | self.spin_thread = None 16 | 17 | def start(self): 18 | if self.disable: 19 | return 20 | if self.stream.isatty() or self.force: 21 | self.stop_running = threading.Event() 22 | self.spin_thread = threading.Thread(target=self.init_spin) 23 | self.spin_thread.start() 24 | 25 | def stop(self): 26 | if self.spin_thread: 27 | self.stop_running.set() 28 | self.spin_thread.join() 29 | 30 | def init_spin(self): 31 | while not self.stop_running.is_set(): 32 | content_to_stream = next(self.spinner_cycle) 33 | self.stream.write(content_to_stream) 34 | self.stream.flush() 35 | self.stop_running.wait(0.25) 36 | self.stream.write(''.join(['\b'] * len(content_to_stream))) 37 | self.stream.flush() 38 | 39 | def __enter__(self): 40 | self.start() 41 | return self 42 | 43 | def __exit__(self, exc_type, exc_val, exc_tb): 44 | if self.disable: 45 | return False 46 | self.stop() 47 | return False 48 | -------------------------------------------------------------------------------- /src/canopy_cli/data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_loader import ( 2 | load_from_path, 3 | IDsNotUniqueError, 4 | DocumentsValidationError 5 | ) 6 | -------------------------------------------------------------------------------- /src/canopy_cli/data_loader/errors.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from click._compat import get_text_stderr 3 | from click import echo 4 | 5 | 6 | class IDsNotUniqueError(ValueError): 7 | pass 8 | 9 | 10 | class DocumentsValidationError(ValueError): 11 | pass 12 | 13 | 14 | class DataLoaderException(Exception): 15 | """An exception that Click can handle and show to the user.""" 16 | 17 | #: The exit code for this exception. 
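#: A non-zero exit code is assumed here to follow the click.ClickException convention,
#: so the CLI process exits with a failure status when this error is shown.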
18 | exit_code = 1 19 | 20 | def __init__(self, file_name: str, row_id: str, err: str) -> None: 21 | message = f""" 22 | {file_name}, line {row_id} - {err} 23 | """ 24 | super().__init__(message) 25 | self.file_name = file_name 26 | self.row_id = row_id 27 | self.err = err 28 | 29 | def format_message(self) -> str: 30 | message = f""" 31 | {self.file_name}, line {self.row_id} - {self.err} 32 | """ 33 | return message 34 | 35 | def __str__(self) -> str: 36 | return self.format_message() 37 | 38 | def show(self, file: t.Optional[t.IO] = None) -> None: 39 | if file is None: 40 | file = get_text_stderr() 41 | 42 | echo("{message}".format(message=self.format_message()), file=file) 43 | -------------------------------------------------------------------------------- /src/canopy_cli/errors.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click import ClickException 3 | 4 | from canopy_cli.data_loader.data_loader import format_multiline 5 | 6 | 7 | class CLIError(ClickException): 8 | def format_message(self) -> str: 9 | return click.style(format_multiline(self.message), fg='red') 10 | 11 | 12 | class ConfigError(RuntimeError): 13 | pass 14 | -------------------------------------------------------------------------------- /src/canopy_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy_server/__init__.py -------------------------------------------------------------------------------- /src/canopy_server/_redocs_template.py: -------------------------------------------------------------------------------- 1 | HTML_TEMPLATE = """<!DOCTYPE html> 2 | <html> 3 | <head> 4 | <meta http-equiv="content-type" content="text/html; charset=UTF-8"> 5 | <title>Canopy API Spec 6 | 7 | 8 | 9 | 15 | 16 | 17 | 18 |
19 | Redoc 20 | 21 | 25 | 26 | 27 | """ # noqa: E501 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .unit.stubs.stub_tokenizer import StubTokenizer 2 | from canopy.tokenizer import Tokenizer 3 | 4 | Tokenizer.initialize(StubTokenizer) 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | TEST_NAMESPACE = "ns" 4 | TEST_CREATE_INDEX_PARAMS = [ 5 | {"spec": {"serverless": {"cloud": "aws", "region": "us-west-2"}}}, 6 | {"spec": {"pod": {"environment": "eu-west1-gcp", "pod_type": "p1.x1"}}}, 7 | {"spec": {"pod": {"environment": "gcp-starter", "pod_type": "p1.x1"}}}, 8 | ] 9 | 10 | 11 | @pytest.fixture(scope="module", params=[None, TEST_NAMESPACE]) 12 | def namespace(request): 13 | return request.param 14 | 15 | 16 | @pytest.fixture(scope="module", 17 | params=TEST_CREATE_INDEX_PARAMS, 18 | # The first key in the spec is the index type ("serverless" \ "pod") 19 | ids=[next(iter(_["spec"])) for _ in TEST_CREATE_INDEX_PARAMS]) 20 | def create_index_params(request): 21 | return request.param 22 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/system/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/knowledge_base/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/knowledge_base/qdrant/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | from canopy.knowledge_base.qdrant.constants import DENSE_VECTOR_NAME 4 | from canopy.knowledge_base.qdrant.converter import QdrantConverter 5 | from canopy.knowledge_base.qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 6 | 7 | import logging 8 | from typing import List 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def total_vectors_in_collection(knowledge_base: QdrantKnowledgeBase): 14 | return knowledge_base._client.count(knowledge_base.collection_name).count 15 | 16 | 17 | def assert_chunks_in_collection(knowledge_base: QdrantKnowledgeBase, encoded_chunks): 18 | ids = [QdrantConverter.convert_id(c.id) for c in encoded_chunks] 19 | fetch_result = 
knowledge_base._client.retrieve( 20 | knowledge_base.collection_name, ids=ids, with_payload=True, with_vectors=True 21 | ) 22 | points = {p.id: p for p in fetch_result} 23 | for chunk in encoded_chunks: 24 | id = QdrantConverter.convert_id(chunk.id) 25 | assert id in points 26 | point = points[id] 27 | assert np.allclose( 28 | point.vector[DENSE_VECTOR_NAME], 29 | np.array(chunk.values, dtype=np.float32), 30 | atol=1e-8, 31 | ) 32 | 33 | assert point.payload["text"] == chunk.text 34 | assert point.payload["document_id"] == chunk.document_id 35 | assert point.payload["source"] == chunk.source 36 | for key, value in chunk.metadata.items(): 37 | assert point.payload[key] == value 38 | 39 | 40 | def assert_ids_in_collection(knowledge_base, ids): 41 | fetch_result = knowledge_base._client.retrieve( 42 | knowledge_base.collection_name, 43 | ids=ids, 44 | ) 45 | assert len(fetch_result) == len( 46 | ids 47 | ), f"Expected {len(ids)} ids, got {len(fetch_result)}" 48 | 49 | 50 | def assert_num_points_in_collection(knowledge_base, num_vectors): 51 | points_in_index = total_vectors_in_collection(knowledge_base) 52 | assert ( 53 | points_in_index == num_vectors 54 | ), f"Expected {num_vectors} vectors in index, got {points_in_index}" 55 | 56 | 57 | def assert_ids_not_in_collection(knowledge_base, ids): 58 | fetch_result = knowledge_base._client.retrieve( 59 | knowledge_base.collection_name, 60 | ids=ids, 61 | ) 62 | assert len(fetch_result) == 0, f"Found {len(fetch_result)} unexpected ids" 63 | 64 | 65 | def qdrant_server_running() -> bool: 66 | """Check if Qdrant server is running.""" 67 | 68 | try: 69 | response = requests.get("http://localhost:6333", timeout=10.0) 70 | response_json = response.json() 71 | return response_json.get("title") == "qdrant - vector search engine" 72 | except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): 73 | return False 74 | 75 | 76 | def qdrant_locations() -> List[str]: 77 | if not qdrant_server_running(): 78 | logger.warning("Running Qdrant tests in memory mode only.") 79 | return [":memory:"] 80 | return ["http://localhost:6333", ":memory:"] 81 | -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.knowledge_base.qdrant.constants import COLLECTION_NAME_PREFIX 3 | from canopy.knowledge_base.qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 4 | from canopy.models.data_models import Document 5 | from tests.system.knowledge_base.qdrant.common import qdrant_locations 6 | from tests.system.knowledge_base.test_knowledge_base import _generate_text 7 | from tests.unit.stubs.stub_chunker import StubChunker 8 | from tests.unit.stubs.stub_dense_encoder import StubDenseEncoder 9 | from tests.unit.stubs.stub_record_encoder import StubRecordEncoder 10 | from tests.util import create_system_tests_index_name 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def collection_name(testrun_uid): 15 | return create_system_tests_index_name(testrun_uid) 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def collection_full_name(collection_name): 20 | return COLLECTION_NAME_PREFIX + collection_name 21 | 22 | 23 | @pytest.fixture(scope="module") 24 | def chunker(): 25 | return StubChunker(num_chunks_per_doc=2) 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def encoder(): 30 | return StubRecordEncoder(StubDenseEncoder()) 31 | 32 | 33 | @pytest.fixture(scope="module", 
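# Parametrized over qdrant_locations(): always an in-memory instance, plus
# http://localhost:6333 when a local Qdrant server is reachable (see common.py above).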
autouse=True, params=qdrant_locations()) 34 | def knowledge_base(collection_name, chunker, encoder, request): 35 | kb = QdrantKnowledgeBase( 36 | collection_name=collection_name, 37 | record_encoder=encoder, 38 | chunker=chunker, 39 | location=request.param, 40 | ) 41 | kb.create_canopy_collection() 42 | 43 | return kb 44 | 45 | 46 | @pytest.fixture 47 | def documents_large(): 48 | return [ 49 | Document( 50 | id=f"doc_{i}_large", 51 | text=f"Sample document {i}", 52 | metadata={"my-key-large": f"value-{i}"}, 53 | ) 54 | for i in range(1000) 55 | ] 56 | 57 | 58 | @pytest.fixture 59 | def encoded_chunks_large(documents_large, chunker, encoder): 60 | chunks = chunker.chunk_documents(documents_large) 61 | return encoder.encode_documents(chunks) 62 | 63 | 64 | @pytest.fixture 65 | def documents_with_datetime_metadata(): 66 | return [ 67 | Document( 68 | id="doc_1_metadata", 69 | text="document with datetime metadata", 70 | source="source_1", 71 | metadata={ 72 | "datetime": "2021-01-01T00:00:00Z", 73 | "datetime_other_format": "January 1, 2021 00:00:00", 74 | "datetime_other_format_2": "2210.03945", 75 | }, 76 | ), 77 | Document(id="2021-01-01T00:00:00Z", text="id is datetime", source="source_1"), 78 | ] 79 | 80 | 81 | @pytest.fixture 82 | def datetime_metadata_encoded_chunks( 83 | documents_with_datetime_metadata, chunker, encoder 84 | ): 85 | chunks = chunker.chunk_documents(documents_with_datetime_metadata) 86 | return encoder.encode_documents(chunks) 87 | 88 | 89 | @pytest.fixture 90 | def encoded_chunks(documents, chunker, encoder): 91 | chunks = chunker.chunk_documents(documents) 92 | return encoder.encode_documents(chunks) 93 | 94 | 95 | @pytest.fixture(scope="module", autouse=True) 96 | def teardown_knowledge_base(collection_full_name, knowledge_base): 97 | yield 98 | 99 | knowledge_base._client.delete_collection(collection_full_name) 100 | knowledge_base.close() 101 | 102 | 103 | @pytest.fixture(scope="module") 104 | def random_texts(): 105 | return [_generate_text(10) for _ in range(5)] 106 | 107 | 108 | @pytest.fixture 109 | def documents(random_texts): 110 | return [ 111 | Document( 112 | id=f"doc_{i}", 113 | text=random_texts[i], 114 | source=f"source_{i}", 115 | metadata={"my-key": f"value-{i}"}, 116 | ) 117 | for i in range(5) 118 | ] 119 | -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/test_config.yml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # QdrantKnowledgeBase test configuration file 3 | # =========================================================== 4 | 5 | knowledge_base: 6 | params: 7 | default_top_k: 5 8 | collection_name: test-config-collection 9 | default_top_k: 10 -------------------------------------------------------------------------------- /tests/system/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/llm/__init__.py -------------------------------------------------------------------------------- /tests/system/llm/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.models.data_models import UserMessage, AssistantMessage 4 | 5 | 6 | @pytest.fixture 7 | def messages(): 8 | # Create a list of MessageBase objects 9 | return [ 10 | UserMessage(content="Hello, 
assistant."), 11 | AssistantMessage(content="Hello, user. How can I assist you?"), 12 | UserMessage(content="Just checking in. Be concise."), 13 | ] 14 | -------------------------------------------------------------------------------- /tests/system/llm/test_azure_openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.llm import AzureOpenAILLM 6 | from .test_openai import SYSTEM_PROMPT 7 | 8 | MODEL_NAME = os.getenv("AZURE_DEPLOYMENT_NAME") 9 | 10 | 11 | @pytest.fixture 12 | def azure_openai_llm(): 13 | if os.getenv("AZURE_DEPLOYMENT_NAME") is None: 14 | pytest.skip( 15 | "Couldn't find Azure deployment name. Skipping Azure OpenAI tests." 16 | ) 17 | return AzureOpenAILLM(model_name=os.getenv("AZURE_DEPLOYMENT_NAME")) 18 | 19 | 20 | def test_init_params(azure_openai_llm): 21 | llm = AzureOpenAILLM( 22 | model_name="test_model_name", 23 | api_version="2020-05-03", 24 | api_key="test_api_key", 25 | temperature=0.9, 26 | top_p=0.95, 27 | n=3, 28 | ) 29 | 30 | assert llm.model_name == "test_model_name" 31 | assert llm.default_model_params["temperature"] == 0.9 32 | assert llm.default_model_params["top_p"] == 0.95 33 | assert llm.default_model_params["n"] == 3 34 | assert llm._client.api_key == "test_api_key" 35 | assert llm._client._api_version == "2020-05-03" 36 | 37 | 38 | @pytest.fixture() 39 | def no_api_key(): 40 | before = os.environ.pop("AZURE_OPENAI_API_KEY", None) 41 | yield 42 | if before is not None: 43 | os.environ["AZURE_OPENAI_API_KEY"] = before 44 | 45 | 46 | def test_missing_api_key(no_api_key): 47 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_API_KEY"): 48 | AzureOpenAILLM(MODEL_NAME) 49 | 50 | 51 | @pytest.fixture() 52 | def bad_api_key(): 53 | before = os.environ.pop("AZURE_OPENAI_API_KEY", None) 54 | os.environ["AZURE_OPENAI_API_KEY"] = "bad key" 55 | yield 56 | if before is not None: 57 | os.environ["AZURE_OPENAI_API_KEY"] = before 58 | 59 | 60 | def test_bad_api_key(bad_api_key, messages): 61 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_API_KEY"): 62 | llm = AzureOpenAILLM(MODEL_NAME) 63 | llm.chat_completion(system_prompt=SYSTEM_PROMPT, chat_history=messages) 64 | 65 | 66 | @pytest.fixture() 67 | def no_azure_endpoint(): 68 | before = os.environ.pop("AZURE_OPENAI_ENDPOINT", None) 69 | yield 70 | if before is not None: 71 | os.environ["AZURE_OPENAI_ENDPOINT"] = before 72 | 73 | 74 | def test_missing_azure_endpoint(no_azure_endpoint): 75 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_ENDPOINT"): 76 | AzureOpenAILLM(MODEL_NAME) 77 | 78 | 79 | @pytest.fixture() 80 | def bad_azure_endpoint(): 81 | before = os.environ.pop("AZURE_OPENAI_ENDPOINT", None) 82 | os.environ["AZURE_OPENAI_ENDPOINT"] = "bad endpoint" 83 | yield 84 | if before is not None: 85 | os.environ["AZURE_OPENAI_ENDPOINT"] = before 86 | 87 | 88 | def test_bad_azure_endpoint(bad_azure_endpoint, messages): 89 | with pytest.raises(RuntimeError, match="Azure OpenAI endpoint"): 90 | llm = AzureOpenAILLM(MODEL_NAME) 91 | llm.chat_completion(system_prompt=SYSTEM_PROMPT, chat_history=messages) 92 | 93 | # def test_function_calling_error(azure_openai_llm): 94 | -------------------------------------------------------------------------------- /tests/system/query_generator/test_cohere_query_generator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.query_generator.cohere import CohereQueryGenerator 4 | from 
canopy.models.data_models import MessageBase, Role 5 | 6 | 7 | @pytest.fixture 8 | def messages(): 9 | return [ 10 | MessageBase( 11 | role=Role.USER, content="Hello, assistant."), 12 | MessageBase( 13 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?"), 14 | MessageBase( 15 | role=Role.USER, content="How do I init a pinecone client?.") 16 | ] 17 | 18 | 19 | def test_generate_queries(messages): 20 | query_generator = CohereQueryGenerator() 21 | queries = query_generator.generate(messages, max_prompt_tokens=100) 22 | assert queries 23 | assert queries[0].text 24 | 25 | 26 | def test_max_tokens_exceeded_raises_error(messages): 27 | query_generator = CohereQueryGenerator() 28 | 29 | with pytest.raises(ValueError): 30 | query_generator.generate(messages, max_prompt_tokens=10) 31 | -------------------------------------------------------------------------------- /tests/system/query_generator/test_query_generator_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.tokenizer.tokenizer import Tokenizer # noqa 4 | from canopy.llm.openai import OpenAILLM # noqa 5 | from canopy.models.data_models import MessageBase, Query # noqa 6 | from canopy.chat_engine.query_generator import FunctionCallingQueryGenerator # noqa 7 | from typing import List # noqa 8 | 9 | 10 | class TestFunctionCallingQueryGeneratorSystem: 11 | 12 | @staticmethod 13 | @pytest.fixture 14 | def openai_llm(): 15 | Tokenizer.initialize() 16 | 17 | @staticmethod 18 | @pytest.fixture 19 | def query_generator(openai_llm): 20 | query_gen = FunctionCallingQueryGenerator( 21 | llm=openai_llm, 22 | ) 23 | return query_gen 24 | 25 | @staticmethod 26 | @pytest.fixture 27 | def sample_messages(): 28 | return [ 29 | MessageBase(role="user", content="What is photosynthesis?") 30 | ] 31 | 32 | @staticmethod 33 | def test_generate_default_params(query_generator, 34 | sample_messages): 35 | result = query_generator.generate(messages=sample_messages, 36 | max_prompt_tokens=100) 37 | assert isinstance(result, List) 38 | assert len(result) > 0 39 | for query in result: 40 | assert isinstance(query, Query) 41 | assert len(query.text) > 0 42 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_anyscale_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.anyscale import AnyscaleRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return AnyscaleRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == 
len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_cohere_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.cohere import CohereRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return CohereRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_jina_record_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.knowledge_base.record_encoder.jina import JinaRecordEncoder 7 | from canopy.models.data_models import Query 8 | 9 | 10 | documents = [KBDocChunk( 11 | id=f"doc_1_{i}", 12 | text=f"Sample document {i}", 13 | document_id=f"doc_{i}", 14 | metadata={"test": i}, 15 | source="doc_1", 16 | ) 17 | for i in range(4) 18 | ] 19 | 20 | queries = [Query(text="Sample query 1"), 21 | Query(text="Sample query 2"), 22 | Query(text="Sample query 3"), 23 | Query(text="Sample query 4")] 24 | 25 | 26 | @pytest.fixture 27 | def encoder(): 28 | if os.getenv("JINA_API_KEY", None) is None: 29 | pytest.skip("Did not find JINA_API_KEY environment variable. 
Skipping...") 30 | return JinaRecordEncoder(batch_size=2) 31 | 32 | 33 | def test_dimension(encoder): 34 | assert encoder.dimension == 768 35 | 36 | 37 | @pytest.mark.parametrize("items,function", 38 | [(documents, "encode_documents"), 39 | (queries, "encode_queries"), 40 | ([], "encode_documents"), 41 | ([], "encode_queries")]) 42 | def test_encode_documents(encoder, items, function): 43 | 44 | encoded_documents = getattr(encoder, function)(items) 45 | 46 | assert len(encoded_documents) == len(items) 47 | assert all(len(encoded.values) == encoder.dimension 48 | for encoded in encoded_documents) 49 | 50 | 51 | @pytest.mark.asyncio 52 | @pytest.mark.parametrize("items,function", 53 | [("aencode_documents", documents), 54 | ("aencode_queries", queries)]) 55 | async def test_aencode_not_implemented(encoder, function, items): 56 | with pytest.raises(NotImplementedError): 57 | await encoder.aencode_queries(items) 58 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_octoai_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.octoai import OctoAIRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return OctoAIRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_openai_record_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.knowledge_base.record_encoder import AzureOpenAIRecordEncoder 7 | from canopy.knowledge_base.record_encoder.openai import OpenAIRecordEncoder 8 | from canopy.models.data_models import Query 9 | 10 | 11 | documents = [KBDocChunk( 12 | id=f"doc_1_{i}", 13 | text=f"Sample document {i}", 14 | document_id=f"doc_{i}", 15 | metadata={"test": i}, 16 | source="doc_1", 17 | ) 18 | for i in range(4) 19 | ] 20 | 21 | queries = [Query(text="Sample query 1"), 22 | Query(text="Sample query 2"), 23 | Query(text="Sample query 3"), 24 | 
Query(text="Sample query 4")] 25 | 26 | 27 | @pytest.fixture(params=[OpenAIRecordEncoder, AzureOpenAIRecordEncoder]) 28 | def encoder(request): 29 | encoder_class = request.param 30 | if encoder_class == AzureOpenAIRecordEncoder: 31 | model_name = os.getenv("AZURE_EMBEDDING_DEPLOYMENT_NAME") 32 | if model_name is None: 33 | pytest.skip( 34 | "Couldn't find Azure deployment name. Skipping Azure OpenAI tests." 35 | ) 36 | return AzureOpenAIRecordEncoder(model_name=model_name, batch_size=2) 37 | elif encoder_class == OpenAIRecordEncoder: 38 | return OpenAIRecordEncoder(batch_size=2) 39 | 40 | 41 | def test_dimension(encoder): 42 | assert encoder.dimension == 1536 43 | 44 | 45 | @pytest.mark.parametrize("items,function", 46 | [(documents, "encode_documents"), 47 | (queries, "encode_queries"), 48 | ([], "encode_documents"), 49 | ([], "encode_queries")]) 50 | def test_encode_documents(encoder, items, function): 51 | 52 | encoded_documents = getattr(encoder, function)(items) 53 | 54 | assert len(encoded_documents) == len(items) 55 | assert all(len(encoded.values) == encoder.dimension 56 | for encoded in encoded_documents) 57 | 58 | 59 | @pytest.mark.asyncio 60 | @pytest.mark.parametrize("items,function", 61 | [("aencode_documents", documents), 62 | ("aencode_queries", queries)]) 63 | async def test_aencode_not_implemented(encoder, function, items): 64 | with pytest.raises(NotImplementedError): 65 | await encoder.aencode_queries(items) 66 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_sentence_transformers_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.sentence_transformers import ( 5 | SentenceTransformerRecordEncoder 6 | ) 7 | from canopy.models.data_models import Query 8 | 9 | documents = [KBDocChunk( 10 | id=f"doc_1_{i}", 11 | text=f"Sample document {i}", 12 | document_id=f"doc_{i}", 13 | metadata={"test": i}, 14 | source="doc_1", 15 | ) 16 | for i in range(4) 17 | ] 18 | 19 | queries = [Query(text="Sample query 1"), 20 | Query(text="Sample query 2"), 21 | Query(text="Sample query 3"), 22 | Query(text="Sample query 4")] 23 | 24 | 25 | @pytest.fixture 26 | def encoder(): 27 | try: 28 | encoder = SentenceTransformerRecordEncoder(batch_size=2) 29 | except ImportError: 30 | pytest.skip( 31 | "`transformers` extra not installed. 
Skipping SentenceTransformer system " 32 | "tests" 33 | ) 34 | return encoder 35 | 36 | 37 | def test_dimension(encoder): 38 | assert encoder.dimension == 384 39 | 40 | 41 | @pytest.mark.parametrize("items,function", 42 | [(documents, "encode_documents"), 43 | (queries, "encode_queries"), 44 | ([], "encode_documents"), 45 | ([], "encode_queries")]) 46 | def test_encode_documents(encoder, items, function): 47 | 48 | encoded_documents = getattr(encoder, function)(items) 49 | 50 | assert len(encoded_documents) == len(items) 51 | assert all(len(encoded.values) == encoder.dimension 52 | for encoded in encoded_documents) 53 | 54 | 55 | @pytest.mark.asyncio 56 | @pytest.mark.parametrize("items,function", 57 | [("aencode_documents", documents), 58 | ("aencode_queries", queries)]) 59 | async def test_aencode_not_implemented(encoder, function, items): 60 | with pytest.raises(NotImplementedError): 61 | await encoder.aencode_queries(items) 62 | -------------------------------------------------------------------------------- /tests/system/reranker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/reranker/__init__.py -------------------------------------------------------------------------------- /tests/system/reranker/test_cohere_reranker.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBQueryResult, KBDocChunkWithScore 6 | from canopy.knowledge_base.reranker import CohereReranker 7 | 8 | 9 | @pytest.fixture 10 | def should_run_test(): 11 | if os.getenv("CO_API_KEY") is None: 12 | pytest.skip( 13 | "Couldn't find Cohere API key. Skipping Cohere tests." 
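# These tests call the live Cohere API, so they are skipped rather than failed
# when no CO_API_KEY is configured.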
14 | ) 15 | 16 | 17 | @pytest.fixture 18 | def cohere_reranker(should_run_test): 19 | return CohereReranker() 20 | 21 | 22 | @pytest.fixture 23 | def documents(): 24 | return [ 25 | KBDocChunkWithScore( 26 | id=f"doc_1_{i}", 27 | text=f"Sample chunk {i}", 28 | document_id="doc_1", 29 | source="doc_1", 30 | score=0.1 * i 31 | ) for i in range(4) 32 | ] 33 | 34 | 35 | @pytest.fixture 36 | def query_result(documents): 37 | return KBQueryResult(query="Sample query 1", 38 | documents=documents) 39 | 40 | 41 | def test_rerank_empty(cohere_reranker): 42 | results = cohere_reranker.rerank([]) 43 | assert results == [] 44 | 45 | 46 | def test_rerank(cohere_reranker, query_result, documents): 47 | id_to_score = {d.id: d.score for d in query_result.documents} 48 | ranked_result = next(iter(cohere_reranker.rerank([query_result]))) 49 | reranked_scores = [doc.score for doc in ranked_result.documents] 50 | 51 | assert len(ranked_result.documents) == len(documents) 52 | assert reranked_scores == sorted(reranked_scores, reverse=True) 53 | 54 | # Make sure the scores are overriden by the reranker 55 | for doc in ranked_result.documents: 56 | assert doc.score != id_to_score[doc.id] 57 | 58 | 59 | def test_bad_api_key(should_run_test, query_result): 60 | with pytest.raises(RuntimeError, match="invalid api token"): 61 | CohereReranker(api_key="bad key").rerank([query_result]) 62 | 63 | 64 | def test_model_name_invalid(should_run_test, query_result): 65 | with pytest.raises(RuntimeError, match="model .* not found"): 66 | CohereReranker(model_name="my-madeup-model").rerank([query_result]) 67 | 68 | 69 | def test_top_n(should_run_test, query_result): 70 | results = CohereReranker(top_n=1).rerank([query_result]) 71 | assert len(results[0].documents) == 1 72 | -------------------------------------------------------------------------------- /tests/system/reranker/test_transparent_reranker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunkWithScore, KBQueryResult 4 | from canopy.knowledge_base.reranker import TransparentReranker 5 | 6 | 7 | @pytest.fixture 8 | def documents(): 9 | return [ 10 | KBDocChunkWithScore( 11 | id=f"doc_1_{i}", 12 | text=f"Sample chunk {i}", 13 | document_id="doc_1", 14 | source="doc_1", 15 | score=0.1 * i 16 | ) for i in range(1) 17 | ] 18 | 19 | 20 | @pytest.fixture 21 | def query_result(documents): 22 | return KBQueryResult(query="Sample query 1", 23 | documents=documents) 24 | 25 | 26 | def test_rerank(query_result): 27 | assert TransparentReranker().rerank([query_result]) == [query_result] 28 | -------------------------------------------------------------------------------- /tests/system/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/tokenizer/__init__.py -------------------------------------------------------------------------------- /tests/system/tokenizer/test_cohere_api_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.models.data_models import MessageBase, Role 6 | from canopy.tokenizer import CohereAPITokenizer 7 | from ...unit.tokenizer.base_test_tokenizer import BaseTestTokenizer 8 | 9 | 10 | class TestCohereAPITokenizer(BaseTestTokenizer): 11 | @staticmethod 12 | @pytest.fixture(scope="class") 13 | def 
tokenizer(): 14 | if not os.getenv("CO_API_KEY"): 15 | pytest.skip("Skipping Cohere API tokenizer tests because " 16 | "the CO_API_KEY environment variable is not set.") 17 | return CohereAPITokenizer(model_name="command") 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | def text(): 22 | return "string with special characters like !@#$%^&*()_+日本 " \ 23 | "spaces \n \n\n CASE cAse " 24 | 25 | @staticmethod 26 | @pytest.fixture 27 | def expected_tokens(text): 28 | return ['string', ' with', ' special', ' characters', ' like', 29 | ' !', '@', '#', '$', '%', '^', '&', '*', '()', '_', '+', '日', 30 | '本', ' spaces', ' ', '\n ', '\n\n', ' CASE', ' c', 'A', 31 | 'se', " "] 32 | 33 | @staticmethod 34 | def test_messages_token_count(tokenizer): 35 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 36 | assert tokenizer.messages_token_count(messages) == 11 37 | 38 | messages = [MessageBase(role=Role.USER, 39 | content="Hello, assistant."), 40 | MessageBase(role=Role.ASSISTANT, 41 | content="Hello, user. How can I assist you?")] 42 | assert tokenizer.messages_token_count(messages) == 25 43 | 44 | @staticmethod 45 | def test_messages_token_count_empty_messages(tokenizer): 46 | assert tokenizer.messages_token_count([]) == 3 47 | -------------------------------------------------------------------------------- /tests/system/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/utils/__init__.py -------------------------------------------------------------------------------- /tests/system/utils/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import yaml 5 | 6 | from canopy.chat_engine import ChatEngine 7 | from canopy.context_engine import ContextEngine 8 | from canopy.knowledge_base import KnowledgeBase 9 | from canopy.utils.directory import Directory 10 | 11 | 12 | @pytest.fixture(scope='module') 13 | def temp_index_name(): 14 | index_name_before = os.getenv("INDEX_NAME", None) 15 | 16 | os.environ["INDEX_NAME"] = "temp_index" 17 | yield "temp_index" 18 | 19 | if index_name_before is None: 20 | del os.environ["INDEX_NAME"] 21 | else: 22 | os.environ["INDEX_NAME"] = index_name_before 23 | 24 | 25 | def test_default_config_matches_code_defaults(temp_index_name): 26 | 27 | with open(Directory.CONFIG_TEMPLATES.joinpath("default.yaml")) as file: 28 | default_config = yaml.safe_load(file) 29 | 30 | chat_engine_config = default_config['chat_engine'] 31 | 32 | loaded_chat_engine = ChatEngine.from_config(chat_engine_config) 33 | default_kb = KnowledgeBase(index_name=temp_index_name) 34 | default_context_engine = ContextEngine(default_kb) 35 | default_chat_engine = ChatEngine(default_context_engine) 36 | 37 | def assert_identical_components(loaded_component, default_component): 38 | assert type(loaded_component) == type(default_component) # noqa: E721 39 | if not loaded_component.__module__.startswith("canopy"): 40 | return 41 | 42 | for key, value in default_component.__dict__.items(): 43 | assert hasattr(loaded_component, key), ( 44 | f"Missing attribute {key} in {type(loaded_component)}" 45 | ) 46 | if hasattr(value, '__dict__'): 47 | assert_identical_components(getattr(loaded_component, key), value) 48 | else: 49 | assert getattr(loaded_component, key) == value, ( 50 | f"Attribute {key} in {type(loaded_component)} is {value} in code " 51 | f"but 
{getattr(loaded_component, key)} in config" 52 | ) 53 | 54 | assert_identical_components(loaded_chat_engine, default_chat_engine) 55 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | random_words = [ 2 | "apple", "banana", "cherry", "date", "elephant", "flamingo", "grape", "honey", 3 | "iceberg", "jacket", "kangaroo", "lemon", "mango", "noodle", "octopus", "penguin", 4 | "quill", "raspberry", "strawberry", "tiger", "umbrella", "violin", "walrus", 5 | "xylophone", "yarn", "zebra", "ant", "bear", "cat", "dog", "eagle", "falcon", 6 | "giraffe", "horse", "iguana", "jellyfish", "koala", "lion", "monkey", "newt", 7 | "ostrich", "parrot", "quokka", "rhino", "snake", "turtle", "urchin", "vulture", 8 | "whale", "x-ray", "yak", "zeppelin", "atom", "bubble", "candle", "desk", "elevator", 9 | "fan", "globe", "hat", "ice", "juice", "kite", "lamp", "mountain", "nail", "orange", 10 | "piano", "quartz", "river", "sun", "tree", "unicorn", "volcano", "wind", 11 | "yogurt", "zipper", "accordion", "bat", "cymbal", "drum", "flute", "guitar", 12 | "harmonica", "ivory", "jazz", "keyboard", "lyre", "maracas", "note", "organ", 13 | "piccolo", "quena", "recorder", "saxophone", "trumpet", "ukulele", "viola", 14 | "yacht", "zone", "adventure", "backpack", "calendar", "dolphin", "equator", 15 | "gazelle", "helicopter", "island", "jigsaw", "kaleidoscope", "lighthouse", 16 | "narrator", "obelisk", "puzzle", "quicksand", "rainbow", "satellite", "telescope", 17 | "utensil", "vortex", "wavelength", "xenon", "yodel", "zucchini", "asteroid", 18 | "crescent", "dynamo", "echo", "fractal", "galaxy", "hexagon", "infinity", "jungle", 19 | "krypton", "lunar", "meteor", "nebula", "orbit", "prism", "quasar", "radiator", 20 | "tornado", "universe", "vector", "warp", "x-axis", "yellow", "zenith", 21 | "xylograph", "window", "festival", "molecule", "biscuit", "solar", 22 | ] 23 | -------------------------------------------------------------------------------- /tests/unit/chat_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/chat_engine/__init__.py -------------------------------------------------------------------------------- /tests/unit/chunker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/chunker/__init__.py -------------------------------------------------------------------------------- /tests/unit/chunker/base_test_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from abc import ABC, abstractmethod 3 | from canopy.models.data_models import Document 4 | 5 | 6 | class BaseTestChunker(ABC): 7 | 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | @abstractmethod 11 | def chunker(): 12 | pass 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def documents(): 17 | return [ 18 | Document( 19 | id="test_document_1", 20 | text="I am a simple test string" 21 | " to check the happy path of this simple chunker", 22 | metadata={"test": 1}), 23 | Document( 24 | id="test_document_2", 25 | text="another simple test string", 26 | metadata={"test": 2}, 27 | source="doc_2" 28 | ), 29 | Document( 30 | id="test_document_3", 31 | 
text="short", 32 | metadata={"test": 2}, 33 | source="doc_3" 34 | ) 35 | ] 36 | 37 | @staticmethod 38 | @pytest.fixture 39 | @abstractmethod 40 | def expected_chunks(documents): 41 | pass 42 | 43 | # region: test chunk_single_document 44 | 45 | @staticmethod 46 | def test_chunk_single_document_happy_path(chunker, documents, expected_chunks): 47 | for doc in documents: 48 | expected_chunks_for_doc = [chunk for chunk in 49 | expected_chunks if chunk.document_id == doc.id] 50 | actual_chunks = chunker.chunk_single_document(doc) 51 | assert len(actual_chunks) == len(expected_chunks_for_doc) 52 | for actual_chunk, expected_chunk in zip(actual_chunks, 53 | expected_chunks_for_doc): 54 | assert actual_chunk == expected_chunk, f"actual: {actual_chunk}\n, " \ 55 | f"expected: {expected_chunk}" 56 | 57 | @staticmethod 58 | def test_chunk_single_document_empty_content(chunker, documents): 59 | empty_document = Document(id="test_document_3", text="", metadata={"test": 3}) 60 | assert chunker.chunk_single_document(empty_document) == [] 61 | 62 | # endregion 63 | 64 | # region: test achunk_single_document 65 | 66 | @staticmethod 67 | @pytest.mark.asyncio 68 | async def test_achunk_single_document_raise_error(chunker, 69 | documents, 70 | expected_chunks): 71 | with pytest.raises(NotImplementedError): 72 | await chunker.achunk_single_document(documents[0]) 73 | 74 | # endregion 75 | 76 | # region: test chunk_documents 77 | 78 | @staticmethod 79 | def test_chunk_documents_happy_path(chunker, 80 | documents, 81 | expected_chunks): 82 | chunks = chunker.chunk_documents(documents) 83 | assert len(chunks) == len(expected_chunks) 84 | for chunk, expected_chunk in zip(chunks, expected_chunks): 85 | assert chunk == expected_chunk 86 | 87 | @staticmethod 88 | def test_chunk_documents_empty_list(chunker): 89 | assert chunker.chunk_documents([]) == [] 90 | 91 | @staticmethod 92 | def test_chunk_documents_empty_content(chunker): 93 | empty_document = Document(id="test_document_3", text="", metadata={"test": 3}) 94 | assert chunker.chunk_documents([empty_document]) == [] 95 | 96 | # endregion 97 | 98 | # region: test achunk_documents 99 | 100 | @staticmethod 101 | @pytest.mark.asyncio 102 | async def test_achunk_documents_raise_error(chunker, documents): 103 | with pytest.raises(NotImplementedError): 104 | await chunker.achunk_documents(documents) 105 | 106 | # endregion 107 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_recursive_character_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.knowledge_base.chunker.recursive_character \ 3 | import RecursiveCharacterChunker 4 | from canopy.knowledge_base.models import KBDocChunk 5 | from tests.unit.chunker.base_test_chunker import BaseTestChunker 6 | 7 | 8 | class TestRecursiveCharacterChunker(BaseTestChunker): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def chunker(): 13 | return RecursiveCharacterChunker(chunk_size=3, 14 | chunk_overlap=1) 15 | 16 | @staticmethod 17 | @pytest.fixture 18 | def expected_chunks(documents): 19 | return [ 20 | KBDocChunk(id='test_document_1_0', 21 | text='I am a', 22 | metadata={'test': 1}, 23 | document_id='test_document_1'), 24 | KBDocChunk(id='test_document_1_1', 25 | text='a simple test', 26 | metadata={'test': 1}, 27 | document_id='test_document_1'), 28 | KBDocChunk(id='test_document_1_2', 29 | text='test string to', 30 | metadata={'test': 1}, 31 | document_id='test_document_1'), 
32 | KBDocChunk(id='test_document_1_3', 33 | text='to check the', 34 | metadata={'test': 1}, 35 | document_id='test_document_1'), 36 | KBDocChunk(id='test_document_1_4', 37 | text='the happy path', 38 | metadata={'test': 1}, 39 | document_id='test_document_1'), 40 | KBDocChunk(id='test_document_1_5', 41 | text='path of this', 42 | metadata={'test': 1}, 43 | document_id='test_document_1'), 44 | KBDocChunk(id='test_document_1_6', 45 | text='this simple chunker', 46 | metadata={'test': 1}, 47 | document_id='test_document_1'), 48 | KBDocChunk(id='test_document_2_0', 49 | text='another simple test', 50 | metadata={'test': 2}, 51 | document_id='test_document_2', 52 | source='doc_2'), 53 | KBDocChunk(id='test_document_2_1', 54 | text='test string', 55 | metadata={'test': 2}, 56 | document_id='test_document_2', 57 | source='doc_2'), 58 | KBDocChunk(id='test_document_3_0', 59 | text='sho', 60 | metadata={'test': 2}, 61 | document_id='test_document_3', 62 | source='doc_3'), 63 | KBDocChunk(id='test_document_3_1', 64 | text='ort', 65 | metadata={'test': 2}, 66 | document_id='test_document_3', 67 | source='doc_3')] 68 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_stub_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from .base_test_chunker import BaseTestChunker 5 | from ..stubs.stub_chunker import StubChunker 6 | 7 | 8 | class TestStubChunker(BaseTestChunker): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def chunker(): 13 | return StubChunker() 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def expected_chunks(documents): 18 | return [KBDocChunk(id=f"{document.id}_0", 19 | document_id=document.id, 20 | text=document.text, 21 | metadata=document.metadata, 22 | source=document.source) 23 | for document in documents] 24 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_token_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.models.data_models import Document 5 | from .base_test_chunker import BaseTestChunker 6 | from canopy.knowledge_base.chunker.token_chunker import TokenChunker 7 | 8 | 9 | class TestTokenChunker(BaseTestChunker): 10 | 11 | @staticmethod 12 | @pytest.fixture(scope="class") 13 | def chunker(): 14 | return TokenChunker(max_chunk_size=5, 15 | overlap=2) 16 | 17 | @staticmethod 18 | @pytest.fixture 19 | def expected_chunks(documents): 20 | return [KBDocChunk(id='test_document_1_0', 21 | text='I am a simple test', 22 | metadata={'test': 1}, 23 | document_id='test_document_1'), 24 | KBDocChunk(id='test_document_1_1', 25 | text='simple test string to check', 26 | metadata={'test': 1}, 27 | document_id='test_document_1'), 28 | KBDocChunk(id='test_document_1_2', 29 | text='to check the happy path', 30 | metadata={'test': 1}, 31 | document_id='test_document_1'), 32 | KBDocChunk(id='test_document_1_3', 33 | text='happy path of this simple', 34 | metadata={'test': 1}, 35 | document_id='test_document_1'), 36 | KBDocChunk(id='test_document_1_4', 37 | text='this simple chunker', 38 | metadata={'test': 1}, 39 | document_id='test_document_1',), 40 | KBDocChunk(id='test_document_2_0', 41 | text='another simple test string', 42 | metadata={'test': 2}, 43 | document_id='test_document_2', 44 | 
source='doc_2'), 45 | KBDocChunk(id='test_document_3_0', 46 | text='short', 47 | metadata={'test': 2}, 48 | document_id='test_document_3', 49 | source='doc_3'), 50 | ] 51 | 52 | @staticmethod 53 | def test_chunk_single_document_zero_overlap(chunker): 54 | chunker._overlap = 0 55 | document = Document(id="test_document_1", 56 | text="I am a test string with no overlap", 57 | metadata={"test": 1}) 58 | actual = chunker.chunk_single_document(document) 59 | 60 | expected = [KBDocChunk(id='test_document_1_0', 61 | text='I am a test string', 62 | metadata={'test': 1}, 63 | document_id='test_document_1'), 64 | KBDocChunk(id='test_document_1_1', 65 | text='with no overlap', 66 | metadata={'test': 1}, 67 | document_id='test_document_1')] 68 | 69 | for actual_chunk, expected_chunk in zip(actual, expected): 70 | assert actual_chunk == expected_chunk 71 | 72 | @staticmethod 73 | def test_chunker_init_raise_on_negative_overlap(): 74 | with pytest.raises(ValueError): 75 | TokenChunker(max_chunk_size=5, 76 | overlap=-1) 77 | 78 | @staticmethod 79 | def test_chunker_init_raise_on_non_positive_max_tokens(): 80 | with pytest.raises(ValueError): 81 | TokenChunker(max_chunk_size=0, 82 | overlap=5) 83 | -------------------------------------------------------------------------------- /tests/unit/cli/test_non_schematic_data_loader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | from pandas.testing import assert_frame_equal 5 | 6 | from canopy_cli.data_loader.data_loader import ( 7 | DataLoaderException, 8 | _load_multiple_txt_files 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def two_valid_txt_files(tmpdir): 14 | file1 = tmpdir.join("file1.txt") 15 | file1.write("the little brown fox\njumped over the lazy dog") 16 | file2 = tmpdir.join("file2.txt") 17 | file2.write("meow meow meow\nmeow meow meow") 18 | return [file1, file2] 19 | 20 | 21 | @pytest.fixture 22 | def invalid_txt_file(tmpdir): 23 | file_path = tmpdir.join("file.txt") 24 | with open(str(file_path), 'w', encoding='latin-1') as file: 25 | file.write("This is a text with bad encoding for UTF-8. ñáéíóú") 26 | return [file_path] 27 | 28 | 29 | @pytest.fixture 30 | def one_invalid_rest_is_valid(tmpdir): 31 | file1 = tmpdir.join("file1.txt") 32 | file1.write("the little brown fox\njumped over the lazy dog") 33 | file2 = tmpdir.join("file2.txt") 34 | file2.write("meow meow meow\nmeow meow meow") 35 | file3 = tmpdir.join("file3.txt") 36 | with open(str(file3), 'w', encoding='latin-1') as file: 37 | file.write("This is a text with bad encoding for UTF-8. 
ñáéíóú") 38 | return [file1, file2, file3] 39 | 40 | 41 | def test_loading_files_good(two_valid_txt_files): 42 | expected = pd.DataFrame([ 43 | { 44 | "id": "file1", 45 | "text": "the little brown fox\njumped over the lazy dog", 46 | "source": str(two_valid_txt_files[0]) 47 | }, 48 | { 49 | "id": "file2", 50 | "text": "meow meow meow\nmeow meow meow", 51 | "source": str(two_valid_txt_files[1]) 52 | } 53 | ]) 54 | docs = _load_multiple_txt_files(two_valid_txt_files) 55 | assert isinstance(docs, pd.DataFrame) 56 | assert_frame_equal(docs, expected) 57 | 58 | 59 | def test_loading_files_bad(invalid_txt_file): 60 | with pytest.raises(DataLoaderException) as e: 61 | _load_multiple_txt_files(invalid_txt_file) 62 | assert str(e.value) == f""" 63 | {invalid_txt_file[0]}, line * - File must be UTF-8 encoded 64 | """ 65 | 66 | 67 | def test_loading_file_one_is_corrupted(one_invalid_rest_is_valid): 68 | expected = pd.DataFrame([ 69 | { 70 | "id": "file1", 71 | "text": "the little brown fox\njumped over the lazy dog", 72 | "source": str(one_invalid_rest_is_valid[0]) 73 | }, 74 | { 75 | "id": "file2", 76 | "text": "meow meow meow\nmeow meow meow", 77 | "source": str(one_invalid_rest_is_valid[1]) 78 | } 79 | ]) 80 | with pytest.raises(DataLoaderException) as e: 81 | _load_multiple_txt_files(one_invalid_rest_is_valid) 82 | assert str(e.value) == f""" 83 | {one_invalid_rest_is_valid[2]}, line * - File must be UTF-8 encoded 84 | """ 85 | remaining = _load_multiple_txt_files(one_invalid_rest_is_valid[:-1]) 86 | assert isinstance(remaining, pd.DataFrame) 87 | assert_frame_equal(remaining, expected) 88 | -------------------------------------------------------------------------------- /tests/unit/context_builder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/context_builder/__init__.py -------------------------------------------------------------------------------- /tests/unit/history_pruner/test_raising_history_pruner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.history_pruner import RaisingHistoryPruner 4 | from canopy.models.data_models import \ 5 | UserMessage, AssistantMessage, Context, StringContextContent 6 | from canopy.tokenizer import Tokenizer 7 | 8 | 9 | SAMPLE_CONTEXT = Context(content=StringContextContent( 10 | "Some context information" 11 | ), 12 | num_tokens=3 13 | ) 14 | SYSTEM_PROMPT = "This is a system prompt." 15 | 16 | 17 | @pytest.fixture 18 | def raising_history_builder(): 19 | return RaisingHistoryPruner() 20 | 21 | 22 | @pytest.fixture 23 | def sample_messages(): 24 | return [ 25 | UserMessage(content="Hello there!"), 26 | AssistantMessage(content="Hi! 
How can I help you?"), 27 | UserMessage(content="Tell me about the weather."), 28 | AssistantMessage(content="Anything else?"), 29 | UserMessage(content="No that's enough"), 30 | ] 31 | 32 | 33 | @pytest.mark.parametrize( 34 | "token_limit, expected_token_count, context, prompt", 35 | [ 36 | (33, 33, None, None), 37 | (50, 33, SAMPLE_CONTEXT, None), 38 | (50, 33, None, SYSTEM_PROMPT), 39 | (50, 33, SAMPLE_CONTEXT, SYSTEM_PROMPT), 40 | ], 41 | ids=[ 42 | "within_limit_no_context_no_prompt", 43 | "within_limit_with_context", 44 | "within_limit_with_prompt", 45 | "within_limit_with_context_and_prompt", 46 | ] 47 | ) 48 | def test_build_within_limits(raising_history_builder, sample_messages, 49 | token_limit, expected_token_count, context, prompt): 50 | messages = raising_history_builder.build(sample_messages, token_limit, 51 | system_prompt=prompt, context=context) 52 | assert Tokenizer().messages_token_count(messages) == expected_token_count 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "token_limit, context, prompt", 57 | [ 58 | (32, None, None), 59 | (33, SAMPLE_CONTEXT, None), 60 | (33, None, SYSTEM_PROMPT), 61 | (31, SAMPLE_CONTEXT, SYSTEM_PROMPT), 62 | ], 63 | ids=[ 64 | "exceed_limit_no_context_no_prompt", 65 | "exceed_limit_with_context", 66 | "exceed_limit_with_prompt", 67 | "exceed_limit_with_context_and_prompt", 68 | ] 69 | ) 70 | def test_build_exceeds_limits(raising_history_builder, sample_messages, 71 | token_limit, context, prompt): 72 | with pytest.raises(ValueError) as e: 73 | raising_history_builder.build(sample_messages, token_limit, 74 | system_prompt=prompt, context=context) 75 | err_msg = e.value.args[0] 76 | assert f"require {Tokenizer().messages_token_count(sample_messages)} " \ 77 | f"tokens" in err_msg 78 | assert f"of {token_limit} tokens left for history" in err_msg 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_abuild_not_implemented(raising_history_builder, sample_messages): 83 | with pytest.raises(NotImplementedError): 84 | await raising_history_builder.abuild(sample_messages, 25) 85 | -------------------------------------------------------------------------------- /tests/unit/history_pruner/test_recent_history_pruner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.history_pruner import RecentHistoryPruner 4 | from canopy.models.data_models import UserMessage, \ 5 | AssistantMessage, Context, StringContextContent 6 | from canopy.tokenizer import Tokenizer 7 | 8 | 9 | SAMPLE_CONTEXT = Context(content=StringContextContent( 10 | "Some context information" 11 | ), num_tokens=3) 12 | SYSTEM_PROMPT = "This is a system prompt." 13 | 14 | 15 | @pytest.fixture 16 | def recent_history_builder(): 17 | return RecentHistoryPruner(min_history_messages=1) 18 | 19 | 20 | @pytest.fixture 21 | def sample_messages(): 22 | return [ 23 | UserMessage(content="Hello there!"), 24 | AssistantMessage(content="Hi! 
How can I help you?"), 25 | UserMessage(content="Tell me about the weather."), 26 | AssistantMessage(content="Anything else?"), 27 | UserMessage(content="No that's enough"), 28 | ] 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "token_limit, expected_tail, expected_token_count, context, prompt", 33 | [ 34 | (50, 5, 33, None, None), 35 | (18, 2, 11, None, None), 36 | (10, 1, 6, None, None), 37 | (50, 5, 33, SAMPLE_CONTEXT, None), 38 | (50, 5, 33, None, SYSTEM_PROMPT), 39 | (50, 5, 33, SAMPLE_CONTEXT, SYSTEM_PROMPT), 40 | (11, 1, 6, SAMPLE_CONTEXT, None), 41 | (18, 1, 6, None, SYSTEM_PROMPT), 42 | (19, 1, 6, SAMPLE_CONTEXT, SYSTEM_PROMPT), 43 | ], 44 | ids=[ 45 | "full_history_fit_no_context_no_prompt", 46 | "truncated_no_context_no_prompt", 47 | "single_message_no_context_no_prompt", 48 | "full_history_fit_with_context", 49 | "full_history_fit_with_prompt", 50 | "full_history_fit_with_context_and_prompt", 51 | "truncated_with_context", 52 | "truncated_with_prompt", 53 | "truncated_with_context_and_prompt", 54 | ] 55 | ) 56 | def test_build(recent_history_builder, 57 | sample_messages, 58 | token_limit, 59 | expected_tail, 60 | expected_token_count, 61 | context, 62 | prompt): 63 | messages = recent_history_builder.build(sample_messages, 64 | token_limit, 65 | system_prompt=prompt, 66 | context=context) 67 | assert messages == sample_messages[-expected_tail:] 68 | assert Tokenizer().messages_token_count(messages) == expected_token_count 69 | 70 | 71 | def test_min_history_messages(sample_messages): 72 | recent_history_builder = RecentHistoryPruner( 73 | min_history_messages=2 74 | ) 75 | token_limit = 18 76 | messages = recent_history_builder.build(sample_messages, token_limit) 77 | assert messages == sample_messages[-2:] 78 | assert Tokenizer().messages_token_count(messages) == 11 79 | 80 | token_limit = 10 81 | with pytest.raises(ValueError) as e: 82 | recent_history_builder.build(sample_messages, token_limit) 83 | err_msg = e.value.args[0] 84 | assert f"The {2} most recent" in err_msg 85 | assert f"calculated history of {token_limit}" in err_msg 86 | assert "history require 11 tokens" in err_msg 87 | 88 | 89 | def test_build_with_empty_history(recent_history_builder): 90 | messages = recent_history_builder.build([], 15) 91 | assert messages == [] 92 | assert Tokenizer().messages_token_count(messages) == 0 93 | 94 | 95 | @pytest.mark.asyncio 96 | async def test_abuild_not_implemented(recent_history_builder, sample_messages): 97 | with pytest.raises(NotImplementedError): 98 | await recent_history_builder.abuild(sample_messages, 25) 99 | -------------------------------------------------------------------------------- /tests/unit/query_generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/query_generators/__init__.py -------------------------------------------------------------------------------- /tests/unit/query_generators/test_instruction_query_generator.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from unittest.mock import create_autospec 3 | 4 | import pytest 5 | 6 | from canopy.chat_engine.query_generator import InstructionQueryGenerator 7 | from canopy.llm import BaseLLM 8 | from canopy.models.api_models import ChatResponse, _Choice, TokenCounts 9 | from canopy.models.data_models import Query, UserMessage, AssistantMessage 10 | 11 | 12 | @pytest.fixture 13 | def 
mock_llm(): 14 | return create_autospec(BaseLLM) 15 | 16 | 17 | @pytest.fixture 18 | def query_generator(mock_llm): 19 | query_gen = InstructionQueryGenerator( 20 | llm=mock_llm, 21 | ) 22 | return query_gen 23 | 24 | 25 | @pytest.fixture 26 | def sample_messages(): 27 | return [UserMessage(content="How can I init a client?"), 28 | AssistantMessage(content="Which kind of client?"), 29 | UserMessage(content="A pinecone client.")] 30 | 31 | 32 | @pytest.mark.parametrize(("response", "query", "call_count"), [ 33 | ( 34 | '{"question": "How do I init a pinecone client?"}', 35 | "How do I init a pinecone client?", 36 | 1 37 | ), 38 | 39 | ( 40 | 'Unparseable JSON response from LLM, falling back to the last message', 41 | "A pinecone client.", 42 | 3 43 | ) 44 | 45 | ]) 46 | def test_generate(query_generator, 47 | mock_llm, 48 | sample_messages, 49 | response, 50 | query, 51 | call_count): 52 | mock_llm.chat_completion.return_value = ChatResponse( 53 | id="meta-llama/Llama-2-7b-chat-hf-HTQ-4", 54 | object="text_completion", 55 | created=1702569324, 56 | model='meta-llama/Llama-2-7b-chat-hf', 57 | usage=TokenCounts( 58 | prompt_tokens=367, 59 | completion_tokens=19, 60 | total_tokens=386 61 | ), 62 | choices=[ 63 | _Choice( 64 | index=0, 65 | message=AssistantMessage( 66 | content=response 67 | ) 68 | ) 69 | ] 70 | ) 71 | 72 | result = query_generator.generate(messages=sample_messages, 73 | max_prompt_tokens=4096) 74 | 75 | assert mock_llm.chat_completion.call_count == call_count 76 | assert isinstance(result, List) 77 | assert len(result) == 1 78 | assert result[0] == Query(text=query) 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_agenerate_not_implemented(query_generator, 83 | mock_llm, 84 | sample_messages 85 | ): 86 | with pytest.raises(NotImplementedError): 87 | await query_generator.agenerate(messages=sample_messages, 88 | max_prompt_tokens=100) 89 | -------------------------------------------------------------------------------- /tests/unit/query_generators/test_last_message_query_generator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.query_generator import LastMessageQueryGenerator 4 | from canopy.models.data_models import UserMessage, Query, AssistantMessage 5 | 6 | 7 | @pytest.fixture 8 | def sample_messages(): 9 | return [ 10 | UserMessage(content="What is photosynthesis?") 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def query_generator(): 16 | return LastMessageQueryGenerator() 17 | 18 | 19 | def test_generate(query_generator, sample_messages): 20 | expected = [Query(text=sample_messages[-1].content)] 21 | actual = query_generator.generate(sample_messages, 0) 22 | assert actual == expected 23 | 24 | 25 | @pytest.mark.asyncio 26 | async def test_agenerate(query_generator, sample_messages): 27 | expected = [Query(text=sample_messages[-1].content)] 28 | actual = await query_generator.agenerate(sample_messages, 0) 29 | assert actual == expected 30 | 31 | 32 | def test_generate_fails_with_empty_history(query_generator): 33 | with pytest.raises(ValueError): 34 | query_generator.generate([], 0) 35 | 36 | 37 | def test_generate_fails_with_no_user_message(query_generator): 38 | with pytest.raises(ValueError): 39 | query_generator.generate([ 40 | AssistantMessage(content="Hi! 
How can I help you?") 41 | ], 0) 42 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/record_encoder/__init__.py -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_dense_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.record_encoder import DenseRecordEncoder 4 | from .base_test_record_encoder import BaseTestRecordEncoder 5 | from ..stubs.stub_dense_encoder import StubDenseEncoder 6 | 7 | 8 | class TestStubRecordEncoder(BaseTestRecordEncoder): 9 | 10 | @staticmethod 11 | @pytest.fixture 12 | def expected_dimension(): 13 | return 3 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def inner_encoder(expected_dimension): 18 | return StubDenseEncoder(dimension=3) 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def record_encoder(inner_encoder): 23 | return DenseRecordEncoder(inner_encoder, batch_size=2) 24 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_jina_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.jina import JinaRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | from unittest.mock import patch 8 | 9 | documents = [KBDocChunk( 10 | id=f"doc_1_{i}", 11 | text=f"Sample document {i}", 12 | document_id=f"doc_{i}", 13 | metadata={"test": i}, 14 | source="doc_1", 15 | ) 16 | for i in range(4) 17 | ] 18 | 19 | queries = [Query(text="Sample query 1"), 20 | Query(text="Sample query 2"), 21 | Query(text="Sample query 3"), 22 | Query(text="Sample query 4")] 23 | 24 | 25 | @pytest.fixture 26 | def encoder(): 27 | return JinaRecordEncoder(api_key='test_api_key', batch_size=2) 28 | 29 | 30 | def test_dimension(encoder): 31 | with patch('pinecone_text.dense.JinaEncoder.encode_documents') \ 32 | as mock_encode_documents: 33 | mock_encode_documents.return_value = [0.1, 0.2, 0.3] 34 | assert encoder.dimension == 3 35 | 36 | 37 | def custom_encode(*args, **kwargs): 38 | input_to_encode = args[0] 39 | if isinstance(input_to_encode, list): 40 | return [[0.1, 0.2, 0.3] for _ in input_to_encode] 41 | else: 42 | return [0.1, 0.2, 0.3] 43 | 44 | 45 | @pytest.mark.parametrize("items,function", 46 | [(documents, "encode_documents"), 47 | (queries, "encode_queries"), 48 | ([], "encode_documents"), 49 | ([], "encode_queries")]) 50 | def test_encode_documents(encoder, items, function): 51 | with patch('pinecone_text.dense.JinaEncoder.encode_documents', 52 | side_effect=custom_encode): 53 | with patch('pinecone_text.dense.JinaEncoder.encode_queries', 54 | side_effect=custom_encode): 55 | encoded_documents = getattr(encoder, function)(items) 56 | 57 | assert len(encoded_documents) == len(items) 58 | assert all(len(encoded.values) == encoder.dimension 59 | for encoded in encoded_documents) 60 | 61 | 62 | @pytest.mark.asyncio 63 | @pytest.mark.parametrize("items,function", 64 | [("aencode_documents", documents), 65 | ("aencode_queries", queries)]) 66 | async def test_aencode_not_implemented(encoder, function, items): 67 | with 
pytest.raises(NotImplementedError): 68 | await encoder.aencode_queries(items) 69 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_sentence_transformers_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.sentence_transformers import ( 5 | SentenceTransformerRecordEncoder 6 | ) 7 | from canopy.models.data_models import Query 8 | 9 | from unittest.mock import patch 10 | 11 | documents = [KBDocChunk( 12 | id=f"doc_1_{i}", 13 | text=f"Sample document {i}", 14 | document_id=f"doc_{i}", 15 | metadata={"test": i}, 16 | source="doc_1", 17 | ) 18 | for i in range(4) 19 | ] 20 | 21 | queries = [Query(text="Sample query 1"), 22 | Query(text="Sample query 2"), 23 | Query(text="Sample query 3"), 24 | Query(text="Sample query 4")] 25 | 26 | 27 | @pytest.fixture 28 | def encoder(): 29 | try: 30 | encoder = SentenceTransformerRecordEncoder(batch_size=2) 31 | except ImportError: 32 | pytest.skip( 33 | "`transformers` extra not installed. Skipping SentenceTransformer unit " 34 | "tests" 35 | ) 36 | return encoder 37 | 38 | 39 | def test_dimension(encoder): 40 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents') \ 41 | as mock_encode_documents: 42 | mock_encode_documents.return_value = [0.1, 0.2, 0.3] 43 | assert encoder.dimension == 3 44 | 45 | 46 | def custom_encode(*args, **kwargs): 47 | input_to_encode = args[0] 48 | if isinstance(input_to_encode, list): 49 | return [[0.1, 0.2, 0.3] for _ in input_to_encode] 50 | else: 51 | return [0.1, 0.2, 0.3] 52 | 53 | 54 | @pytest.mark.parametrize("items,function", 55 | [(documents, "encode_documents"), 56 | (queries, "encode_queries"), 57 | ([], "encode_documents"), 58 | ([], "encode_queries")]) 59 | def test_encode_documents(encoder, items, function): 60 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents', 61 | side_effect=custom_encode): 62 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_queries', 63 | side_effect=custom_encode): 64 | encoded_documents = getattr(encoder, function)(items) 65 | 66 | assert len(encoded_documents) == len(items) 67 | assert all(len(encoded.values) == encoder.dimension 68 | for encoded in encoded_documents) 69 | 70 | 71 | @pytest.mark.asyncio 72 | @pytest.mark.parametrize("items,function", 73 | [("aencode_documents", documents), 74 | ("aencode_queries", queries)]) 75 | async def test_aencode_not_implemented(encoder, function, items): 76 | with pytest.raises(NotImplementedError): 77 | await encoder.aencode_queries(items) 78 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_stub_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .base_test_record_encoder import BaseTestRecordEncoder 3 | from ..stubs.stub_dense_encoder import StubDenseEncoder 4 | from ..stubs.stub_record_encoder import StubRecordEncoder 5 | 6 | 7 | class TestStubRecordEncoder(BaseTestRecordEncoder): 8 | 9 | @staticmethod 10 | @pytest.fixture 11 | def expected_dimension(): 12 | return 3 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def inner_encoder(expected_dimension): 17 | return StubDenseEncoder(dimension=3) 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | def record_encoder(inner_encoder): 22 | return 
StubRecordEncoder(inner_encoder, 23 |                                  batch_size=2) 24 | -------------------------------------------------------------------------------- /tests/unit/stubs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/stubs/__init__.py -------------------------------------------------------------------------------- /tests/unit/stubs/stub_chunker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from canopy.knowledge_base.chunker.base import Chunker 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.models.data_models import Document 5 | 6 | 7 | class StubChunker(Chunker): 8 | 9 |     def __init__(self, num_chunks_per_doc: int = 1): 10 |         super().__init__() 11 |         self.num_chunks_per_doc = num_chunks_per_doc 12 | 13 |     def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 14 |         if document.text == "": 15 |             return [] 16 | 17 |         # simply duplicate docs as chunks 18 |         return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 19 |                            document_id=document.id, 20 |                            text=document.text + (f" dup_{i}" if i > 0 else ""), 21 |                            source=document.source, 22 |                            metadata=document.metadata) 23 |                 for i in range(self.num_chunks_per_doc)] 24 | 25 |     async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 26 |         raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_dense_encoder.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import numpy as np 3 | from collections import defaultdict 4 | from typing import Union, List 5 | 6 | from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder 7 | 8 | 9 | class StubDenseEncoder(BaseDenseEncoder): 10 |     """ 11 |     Bag-of-words encoder that hashes each word and projects the resulting 12 |     sparse word-count vector to a dense vector using a per-word seeded random 13 |     projection, in the spirit of the Johnson–Lindenstrauss lemma. 
14 | """ 15 | 16 | def __init__(self, 17 | dimension: int = 8, 18 | vocab_size: int = 2 ** 12): 19 | self.input_dim = vocab_size 20 | self.dimension = dimension 21 | 22 | def _text_to_word_counts(self, text: str) -> defaultdict: 23 | words = text.split() 24 | word_counts = defaultdict(int) 25 | for word in words: 26 | hashed_word = mmh3.hash(word) % self.input_dim 27 | word_counts[hashed_word] += 1 28 | return word_counts 29 | 30 | def _encode_text(self, text: str) -> List[float]: 31 | word_counts = self._text_to_word_counts(text) 32 | 33 | # This will hold the result of word_counts * random_matrix 34 | projected_embedding = np.zeros(self.dimension, dtype=np.float32) 35 | 36 | for hashed_word, count in word_counts.items(): 37 | rng = np.random.default_rng(hashed_word) 38 | # Seed the RNG with the hashed word index for consistency 39 | random_vector = rng.standard_normal(self.dimension) 40 | projected_embedding += count * random_vector 41 | 42 | projected_embedding = projected_embedding.astype(np.float32) 43 | return list(projected_embedding / np.linalg.norm(projected_embedding)) 44 | 45 | def encode_documents(self, 46 | texts: Union[str, List[str]] 47 | ) -> Union[List[float], List[List[float]]]: 48 | return self._encode(texts) 49 | 50 | def encode_queries(self, 51 | texts: Union[str, List[str]] 52 | ) -> Union[List[float], List[List[float]]]: 53 | return self._encode(texts) 54 | 55 | def _encode(self, 56 | texts: Union[str, List[str]] 57 | ) -> Union[List[float], List[List[float]]]: 58 | if isinstance(texts, str): 59 | return self._encode_text(texts) 60 | else: 61 | return [self._encode_text(text) for text in texts] 62 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_record_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.knowledge_base.record_encoder import RecordEncoder 4 | from canopy.knowledge_base.models import KBQuery, KBDocChunk, KBEncodedDocChunk 5 | from canopy.models.data_models import Query 6 | from .stub_dense_encoder import StubDenseEncoder 7 | 8 | 9 | class StubRecordEncoder(RecordEncoder): 10 | 11 | def __init__(self, 12 | stub_dense_encoder: StubDenseEncoder, 13 | batch_size: int = 1): 14 | super().__init__(batch_size) 15 | self._dense_encoder = stub_dense_encoder 16 | 17 | def _encode_documents_batch(self, 18 | documents: List[KBDocChunk] 19 | ) -> List[KBEncodedDocChunk]: 20 | result: List[KBEncodedDocChunk] = [] 21 | for doc in documents: 22 | values = self._dense_encoder.encode_documents(doc.text) 23 | result.append( 24 | KBEncodedDocChunk( 25 | **doc.model_dump(), 26 | values=values)) 27 | return result 28 | 29 | def _encode_queries_batch(self, 30 | queries: List[Query] 31 | ) -> List[KBQuery]: 32 | result: List[KBQuery] = [] 33 | for query in queries: 34 | values = self._dense_encoder.encode_queries(query.text) 35 | result.append( 36 | KBQuery(**query.model_dump(), 37 | values=values)) 38 | return result 39 | 40 | async def _aencode_documents_batch(self, 41 | documents: List[KBDocChunk] 42 | ) -> List[KBEncodedDocChunk]: 43 | raise NotImplementedError() 44 | 45 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 46 | raise NotImplementedError() 47 | 48 | @property 49 | def dimension(self) -> int: 50 | return self._dense_encoder.dimension 51 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_tokenizer.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | from canopy.tokenizer.base import BaseTokenizer 3 | from canopy.models.data_models import Messages 4 | 5 | 6 | class StubTokenizer(BaseTokenizer): 7 | 8 | def __init__(self, message_overhead: int = 3): 9 | self._message_overhead = message_overhead 10 | 11 | def tokenize(self, text: str) -> List[str]: 12 | return text.split() 13 | 14 | def detokenize(self, tokens: List[str]) -> str: 15 | if not isinstance(tokens, List): 16 | raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") 17 | return " ".join(tokens) 18 | 19 | def messages_token_count(self, messages: Messages) -> int: 20 | return sum(len(self.tokenize(msg.content)) + self._message_overhead 21 | for msg in messages) 22 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/tokenizer/__init__.py -------------------------------------------------------------------------------- /tests/unit/tokenizer/base_test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pytest 3 | 4 | 5 | class BaseTestTokenizer(ABC): 6 | 7 | @staticmethod 8 | @pytest.fixture(scope="class") 9 | @abstractmethod 10 | def tokenizer(): 11 | pass 12 | 13 | @staticmethod 14 | @pytest.fixture 15 | def text(): 16 | return "string with special characters like !@#$%^&*()_+ 日本 " \ 17 | "spaces \n \n\n CASE cAse " 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | @abstractmethod 22 | def expected_tokens(text): 23 | pass 24 | 25 | # region: test tokenize 26 | 27 | @staticmethod 28 | def test_tokenize(tokenizer, text, expected_tokens): 29 | tokens = tokenizer.tokenize(text) 30 | assert tokens == expected_tokens, f"\nExpected: {expected_tokens}" \ 31 | f"\nActual: {tokens}" 32 | 33 | @staticmethod 34 | def test_tokenize_empty_string(tokenizer): 35 | assert tokenizer.tokenize("") == [] 36 | 37 | @staticmethod 38 | def test_tokenize_invalid_input_type_raise_exception(tokenizer): 39 | with pytest.raises(Exception): 40 | tokenizer.tokenize(1) 41 | 42 | with pytest.raises(Exception): 43 | tokenizer.tokenize(["asd"]) 44 | 45 | # endregion 46 | 47 | # region: test detokenize 48 | 49 | @staticmethod 50 | def test_detokenize(tokenizer, text, expected_tokens): 51 | text = tokenizer.detokenize(expected_tokens) 52 | assert text == text 53 | 54 | @staticmethod 55 | def test_detokenize_empty_string(tokenizer): 56 | assert tokenizer.detokenize([]) == "" 57 | 58 | @staticmethod 59 | def test_detokenize_invalid_input_type_raise_exception(tokenizer): 60 | with pytest.raises(Exception): 61 | tokenizer.detokenize(1) 62 | 63 | with pytest.raises(Exception): 64 | tokenizer.detokenize("asd") 65 | 66 | # endregion 67 | 68 | # region test token_count 69 | 70 | @staticmethod 71 | def test_token_count(tokenizer, text, expected_tokens): 72 | token_count = tokenizer.token_count(text) 73 | assert token_count == len(expected_tokens) 74 | assert token_count == len(tokenizer.tokenize(text)) 75 | 76 | @staticmethod 77 | def test_token_count_empty_string(tokenizer): 78 | assert tokenizer.token_count("") == 0 79 | 80 | # endregion 81 | 82 | @staticmethod 83 | def test_tokenize_detokenize_compatibility(tokenizer, text, expected_tokens): 84 | retext = 
tokenizer.detokenize(tokenizer.tokenize(text)) 85 | assert retext == text, f"\nExpected: {text}\nActual: {retext}" 86 | reconstructed_expected_tokens = tokenizer.tokenize( 87 | tokenizer.detokenize(expected_tokens)) 88 | assert reconstructed_expected_tokens == expected_tokens, \ 89 | f"\nExpected: {expected_tokens}\nActual: {reconstructed_expected_tokens}" 90 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_cohere_hf_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import CohereHFTokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestCohereHFTokenizer(BaseTestTokenizer): 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | def tokenizer(): 11 | try: 12 | tokenizer = CohereHFTokenizer() 13 | except ImportError: 14 | pytest.skip( 15 | "`cohere` extra not installed. Skipping CohereHFTokenizer unit " 16 | "tests" 17 | ) 18 | return tokenizer 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def expected_tokens(text): 23 | return ['string', 24 | 'Ġwith', 25 | 'Ġspecial', 26 | 'Ġcharacters', 27 | 'Ġlike', 28 | 'Ġ!', 29 | '@', 30 | '#$', 31 | '%^', 32 | '&', 33 | '*', 34 | '()', 35 | '_', 36 | '+', 37 | 'ĠæĹ¥æľ¬', 38 | 'Ġspaces', 39 | 'ĠĠĠ', 40 | 'ĊĠĊĊ', 41 | 'ĠCASE', 42 | 'Ġc', 43 | 'A', 44 | 'se', 45 | 'Ġ'] 46 | 47 | @staticmethod 48 | def test_messages_token_count(tokenizer): 49 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 50 | assert tokenizer.messages_token_count(messages) == 11 51 | 52 | messages = [ 53 | MessageBase(role=Role.USER, content="Hello, assistant."), 54 | MessageBase( 55 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?" 56 | ), 57 | ] 58 | assert tokenizer.messages_token_count(messages) == 25 59 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import LlamaTokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestLlamaTokenizer(BaseTestTokenizer): 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | def tokenizer(): 11 | try: 12 | tokenizer = LlamaTokenizer(model_name="hf-internal-testing/llama-tokenizer") 13 | except ImportError: 14 | pytest.skip( 15 | "`transformers` extra not installed. Skipping LLamaTokenizer unit " 16 | "tests" 17 | ) 18 | return tokenizer 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def expected_tokens(text): 23 | return [ 24 | "▁string", 25 | "▁with", 26 | "▁special", 27 | "▁characters", 28 | "▁like", 29 | "▁!", 30 | "@", 31 | "#", 32 | "$", 33 | "%", 34 | "^", 35 | "&", 36 | "*", 37 | "()", 38 | "_+", 39 | "▁", 40 | "日", 41 | "本", 42 | "▁spaces", 43 | "▁▁▁", 44 | "<0x0A>", 45 | "▁", 46 | "<0x0A>", 47 | "<0x0A>", 48 | "▁CASE", 49 | "▁c", 50 | "A", 51 | "se", 52 | "▁", 53 | ] 54 | 55 | @staticmethod 56 | def test_messages_token_count(tokenizer): 57 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 58 | assert tokenizer.messages_token_count(messages) == 11 59 | 60 | messages = [ 61 | MessageBase(role=Role.USER, content="Hello, assistant."), 62 | MessageBase( 63 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?" 
64 | ), 65 | ] 66 | assert tokenizer.messages_token_count(messages) == 25 67 | 68 | @staticmethod 69 | def test_messages_token_count_empty_messages(tokenizer): 70 | assert tokenizer.messages_token_count([]) == 3 71 | 72 | @staticmethod 73 | def test_special_tokens_to_natural_text(tokenizer): 74 | input_text = "_<0x0A>__ word" 75 | tokens = tokenizer.tokenize(input_text) 76 | expected_tokens = [ 77 | "▁", 78 | "_", 79 | "<", 80 | "0", 81 | "x", 82 | "0", 83 | "A", 84 | ">", 85 | "__", 86 | "▁", 87 | "<", 88 | "s", 89 | ">", 90 | "word", 91 | ] 92 | assert tokens == expected_tokens 93 | 94 | # TODO: this currently fails since detokenize() adds a space after and . 95 | # We need to decide if this is the desired behavior or not. 96 | assert tokenizer.detokenize(tokens) == input_text 97 | 98 | assert tokenizer.token_count(input_text) == len(tokens) 99 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_openai_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import OpenAITokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestOpenAITokenizer(BaseTestTokenizer): 8 | 9 | @staticmethod 10 | @pytest.fixture(scope="class") 11 | def tokenizer(): 12 | return OpenAITokenizer(model_name="gpt-3.5-turbo") 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def expected_tokens(text): 17 | return ['string', ' with', ' special', ' characters', ' like', 18 | ' !', '@', '#$', '%^', '&', '*', '()', '_', '+', ' 日', 19 | '本', ' spaces', ' \n', ' \n\n', ' CASE', ' c', 'A', 20 | 'se', " "] 21 | 22 | @staticmethod 23 | def test_messages_token_count(tokenizer): 24 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 25 | assert tokenizer.messages_token_count(messages) == 11 26 | 27 | messages = [MessageBase(role=Role.USER, 28 | content="Hello, assistant."), 29 | MessageBase(role=Role.ASSISTANT, 30 | content="Hello, user. 
How can I assist you?")] 31 | assert tokenizer.messages_token_count(messages) == 25 32 | 33 | @staticmethod 34 | def test_messages_token_count_empty_messages(tokenizer): 35 | assert tokenizer.messages_token_count([]) == 3 36 | 37 | @staticmethod 38 | def test_special_tokens_to_natural_text(tokenizer): 39 | tokens = tokenizer.tokenize("<|endoftext|>") 40 | assert tokens == ['<', '|', 'endo', 'ft', 'ext', '|', '>'] 41 | 42 | assert tokenizer.detokenize(tokens) == "<|endoftext|>" 43 | 44 | assert tokenizer.token_count("<|endoftext|>") == 7 45 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_stub_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | from ..stubs.stub_tokenizer import StubTokenizer 6 | 7 | 8 | class TestStubTokenizer(BaseTestTokenizer): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def tokenizer(): 13 | return StubTokenizer() 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def expected_tokens(text): 18 | return text.split() 19 | 20 | @staticmethod 21 | def test_tokenize_detokenize_compatibility(tokenizer, text, expected_tokens): 22 | assert tokenizer.detokenize(tokenizer.tokenize(text)) \ 23 | == " ".join(text.split()) 24 | assert tokenizer.tokenize(tokenizer.detokenize(expected_tokens))\ 25 | == expected_tokens 26 | 27 | @staticmethod 28 | def test_messages_token_count(tokenizer): 29 | messages = [MessageBase(role=Role.USER, content="hi bye"), 30 | MessageBase(role=Role.ASSISTANT, content="hi")] 31 | assert tokenizer.messages_token_count(messages) == 3 + len(messages) * 3 32 | 33 | @staticmethod 34 | def test_messages_token_count_empty_messages(tokenizer): 35 | assert tokenizer.messages_token_count([]) == 0 36 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_tokenizer_singleton.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import Tokenizer 3 | from ..stubs.stub_tokenizer import StubTokenizer 4 | 5 | 6 | class StubChildTokenizer(StubTokenizer): 7 | pass 8 | 9 | 10 | @pytest.fixture 11 | def reset_tokenizer_singleton(): 12 | before = Tokenizer._tokenizer_instance.__class__ 13 | Tokenizer.clear() 14 | yield 15 | Tokenizer.clear() 16 | Tokenizer.initialize(tokenizer_class=before) 17 | 18 | 19 | def test_tokenizer_init(reset_tokenizer_singleton): 20 | Tokenizer.initialize(StubTokenizer) 21 | assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer) 22 | assert Tokenizer._initialized is True 23 | 24 | 25 | def test_tokenizer_init_already_initialized_same_class(reset_tokenizer_singleton): 26 | Tokenizer.initialize(StubTokenizer, message_overhead=10) 27 | tokenizer = Tokenizer() 28 | assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer) 29 | assert Tokenizer._initialized is True 30 | assert Tokenizer._tokenizer_instance._message_overhead == 10 31 | assert tokenizer._tokenizer_instance._message_overhead == 10 32 | 33 | 34 | def test_tokenizer_init_already_initialized_different_class(reset_tokenizer_singleton): 35 | Tokenizer.initialize(StubChildTokenizer, message_overhead=10) 36 | tokenizer = Tokenizer() 37 | assert isinstance(Tokenizer._tokenizer_instance, StubChildTokenizer) 38 | assert Tokenizer._initialized is True 39 | assert 
isinstance(tokenizer._tokenizer_instance, StubChildTokenizer) 40 | 41 | 42 | def test_tokenizer_init_invalid_same_class(reset_tokenizer_singleton): 43 | with pytest.raises(ValueError): 44 | Tokenizer.initialize(Tokenizer) 45 | 46 | 47 | def test_tokenizer_init_invalid_tokenizer_class(reset_tokenizer_singleton): 48 | class InvalidTokenizer: 49 | pass 50 | with pytest.raises(ValueError): 51 | Tokenizer.initialize(InvalidTokenizer) 52 | 53 | 54 | def test_tokenizer_uniqueness(reset_tokenizer_singleton): 55 | Tokenizer.initialize(StubTokenizer) 56 | tokenizer = Tokenizer() 57 | assert tokenizer is Tokenizer() 58 | another_tokenizer = Tokenizer() 59 | assert tokenizer is another_tokenizer 60 | -------------------------------------------------------------------------------- /tests/unit/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/utils/__init__.py -------------------------------------------------------------------------------- /tests/unit/utils/_stub_classes.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Optional 3 | 4 | from canopy.utils.config import ConfigurableMixin 5 | 6 | 7 | # A base class that inherits from ConfigurableMixin, with multiple derived classes 8 | class BaseStubChunker(abc.ABC, ConfigurableMixin): 9 | @abc.abstractmethod 10 | def chunk(self, text: str) -> str: 11 | pass 12 | 13 | 14 | class StubChunker(BaseStubChunker): 15 | DEFAULT_CHUNK_SIZE = 100 16 | DEFAULT_SPLITTER = ' ' 17 | 18 | def __init__(self, chunk_size=DEFAULT_CHUNK_SIZE, splitter=DEFAULT_SPLITTER): 19 | self.chunk_size = chunk_size 20 | self.splitter = splitter 21 | 22 | def chunk(self, text: str) -> str: 23 | return text.split(self.splitter) 24 | 25 | 26 | class StubOtherChunker(BaseStubChunker): 27 | DEFAULT_CHUNK_SIZE = 200 28 | 29 | def __init__(self, chunk_size=DEFAULT_CHUNK_SIZE, some_param=' '): 30 | self.chunk_size = chunk_size 31 | self.splitter = some_param 32 | 33 | def chunk(self, text: str) -> str: 34 | return text.split(self.splitter) 35 | 36 | 37 | # A base class that inherits from ConfigurableMixin, where the derived class has 38 | # default components 39 | class BaseStubKB(abc.ABC, ConfigurableMixin): 40 | pass 41 | 42 | 43 | class StubKB(BaseStubKB): 44 | _DEFAULT_COMPONENTS = { 45 | 'chunker': StubChunker, 46 | } 47 | 48 | DEFAULT_TOP_K = 5 49 | 50 | def __init__(self, 51 | chunker: Optional[BaseStubChunker] = None, 52 | top_k: int = DEFAULT_TOP_K, 53 | ): 54 | self.chunker = chunker or self._DEFAULT_COMPONENTS['chunker']() 55 | self.top_k = top_k 56 | 57 | 58 | class BaseStubContextBuilder(ConfigurableMixin): 59 | pass 60 | 61 | 62 | class StubContextBuilder(BaseStubContextBuilder): 63 | DEFAULT_MAX_CONTEXT_LENGTH = 1000 64 | 65 | def __init__(self, max_context_length: int = DEFAULT_MAX_CONTEXT_LENGTH): 66 | self.max_context_length = max_context_length 67 | 68 | 69 | # A base class that inherits from ConfigurableMixin, where the derived class has 70 | # default components, one of them is a class that also inherits from ConfigurableMixin 71 | class BaseStubContextEngine(ConfigurableMixin): 72 | pass 73 | 74 | 75 | class StubContextEngine(BaseStubContextEngine): 76 | _DEFAULT_COMPONENTS = { 77 | 'knowledge_base': StubKB, 78 | 'context_builder': StubContextBuilder, 79 | } 80 | 81 | def __init__(self, 82 | knowledge_base: StubKB, 83 | context_builder: 
Optional[BaseStubContextBuilder] = None, 84 |                  filter: Optional[dict] = None, 85 |                  ): 86 |         self.knowledge_base = knowledge_base or self._DEFAULT_COMPONENTS['knowledge_base']() 87 |         self.context_builder = (context_builder or 88 |                                 self._DEFAULT_COMPONENTS['context_builder']()) 89 |         self.filter = filter 90 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | from typing import List 4 | 5 | from canopy.knowledge_base.knowledge_base import _get_global_client, INDEX_NAME_PREFIX 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def create_index_name(testrun_uid: str, prefix: str) -> str: 11 |     today = datetime.today().strftime("%Y-%m-%d") 12 |     return f"{testrun_uid[-6:]}-{prefix}-{today}" 13 | 14 | 15 | def create_system_tests_index_name(testrun_uid: str) -> str: 16 |     return create_index_name(testrun_uid, "test-kb") 17 | 18 | 19 | def create_e2e_tests_index_name(testrun_uid: str, index_type: str) -> str: 20 |     return create_index_name(testrun_uid, f"test-app-{index_type}") 21 | 22 | 23 | def get_related_indexes(indexes: List[str], testrun_uid: str) -> List[str]: 24 |     return [ 25 |         index for index in indexes 26 |         if index.startswith(f"{INDEX_NAME_PREFIX}{testrun_uid[-6:]}") 27 |     ] 28 | 29 | 30 | def cleanup_indexes(testrun_uid: str): 31 |     client = _get_global_client() 32 |     current_indexes = client.list_indexes().names() 33 |     index_names = get_related_indexes(current_indexes, testrun_uid) 34 |     logger.info(f"Preparing to clean up indexes: {index_names}") 35 |     for index_name in index_names: 36 |         logger.info(f"Deleting index '{index_name}'...") 37 |         client.delete_index(index_name) 38 |         logger.info(f"Index '{index_name}' deleted.") 39 | --------------------------------------------------------------------------------
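Note on /tests/util.py above: these helpers are meant to be driven by the test session itself. The `testrun_uid` argument corresponds to the unique run id that pytest-xdist exposes as a fixture, and `cleanup_indexes` only deletes Pinecone indexes whose names embed that id (via the `INDEX_NAME_PREFIX` plus the last six characters of the run id), so repeated or parallel CI runs do not delete each other's indexes. The repository's real wiring for this presumably lives in tests/conftest.py and scripts/cleanup_indexes.py, which are not shown in this excerpt; the snippet below is only an illustrative sketch, assuming the pytest-xdist `testrun_uid` fixture is available and that tests/util.py is importable as `tests.util`, of how such cleanup could be attached to the end of a session.

# Illustrative sketch only; not a file from this repository.
# Assumes pytest-xdist provides the session-scoped `testrun_uid` fixture
# and that tests/util.py is importable as `tests.util`.
import pytest

from tests.util import cleanup_indexes


@pytest.fixture(scope="session", autouse=True)
def delete_indexes_created_by_this_run(testrun_uid):
    # Let the whole session run first, then remove any index whose name was
    # built from this run's id via create_index_name() above.
    yield
    cleanup_indexes(testrun_uid)

Keying the index names on `testrun_uid` is what makes this kind of teardown safe: each run only ever deletes the indexes it created itself.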