├── .devcontainer └── devcontainer.json ├── .env.example ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ └── feature-request.yml ├── actions │ └── install-deps-and-canopy │ │ └── action.yml └── workflows │ ├── PR.yml │ ├── build-push-image.yml │ ├── pre-release-CI.yml │ └── release.yml ├── .gitignore ├── .readme-content ├── class_architecture.png ├── new.gif ├── rag_flow.png ├── resin-chat-no-rag.gif └── sketch.png ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── docs ├── deployment-gcp.md └── library.md ├── examples └── canopy-lib-quickstart.ipynb ├── pyproject.toml ├── scripts └── cleanup_indexes.py ├── src ├── canopy │ ├── __init__.py │ ├── chat_engine │ │ ├── __init__.py │ │ ├── chat_engine.py │ │ ├── exceptions.py │ │ ├── history_pruner │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── raising.py │ │ │ └── recent.py │ │ └── query_generator │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── function_calling.py │ │ │ ├── instruction.py │ │ │ └── last_message.py │ ├── config_templates │ │ ├── anyscale.yaml │ │ ├── azure.yaml │ │ ├── cohere.yaml │ │ ├── default.yaml │ │ └── octoai.yaml │ ├── context_engine │ │ ├── __init__.py │ │ ├── context_builder │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── stuffing.py │ │ └── context_engine.py │ ├── knowledge_base │ │ ├── __init__.py │ │ ├── base.py │ │ ├── chunker │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── langchain_text_splitter.py │ │ │ ├── markdown.py │ │ │ ├── recursive_character.py │ │ │ └── token_chunker.py │ │ ├── knowledge_base.py │ │ ├── models.py │ │ ├── qdrant │ │ │ ├── constants.py │ │ │ ├── converter.py │ │ │ ├── qdrant_knowledge_base.py │ │ │ └── utils.py │ │ ├── record_encoder │ │ │ ├── __init__.py │ │ │ ├── anyscale.py │ │ │ ├── azure_openai.py │ │ │ ├── base.py │ │ │ ├── cohere.py │ │ │ ├── dense.py │ │ │ ├── hybrid.py │ │ │ ├── jina.py │ │ │ ├── octoai.py │ │ │ ├── openai.py │ │ │ └── sentence_transformers.py │ │ └── reranker │ │ │ ├── __init__.py │ │ │ ├── cohere.py │ │ │ ├── reranker.py │ │ │ └── transparent.py │ ├── llm │ │ ├── __init__.py │ │ ├── anyscale.py │ │ ├── azure_openai_llm.py │ │ ├── base.py │ │ ├── cohere.py │ │ ├── models.py │ │ ├── octoai.py │ │ └── openai.py │ ├── models │ │ ├── __init__.py │ │ ├── api_models.py │ │ └── data_models.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cohere.py │ │ ├── llama.py │ │ ├── openai.py │ │ └── tokenizer.py │ └── utils │ │ ├── __init__.py │ │ ├── config.py │ │ ├── debugging.py │ │ └── directory.py ├── canopy_cli │ ├── __init__.py │ ├── cli.py │ ├── cli_spinner.py │ ├── data_loader │ │ ├── __init__.py │ │ ├── data_loader.py │ │ └── errors.py │ └── errors.py └── canopy_server │ ├── __init__.py │ ├── _redocs_template.py │ ├── app.py │ └── models │ └── v1 │ └── api_models.py └── tests ├── __init__.py ├── conftest.py ├── e2e ├── __init__.py └── test_app.py ├── system ├── __init__.py ├── knowledge_base │ ├── __init__.py │ ├── qdrant │ │ ├── __init__.py │ │ ├── common.py │ │ ├── conftest.py │ │ ├── test_async_qdrant_knowledge_base.py │ │ ├── test_config.yml │ │ └── test_qdrant_knowledge_base.py │ └── test_knowledge_base.py ├── llm │ ├── __init__.py │ ├── conftest.py │ ├── test_azure_openai.py │ ├── test_cohere.py │ └── test_openai.py ├── query_generator │ ├── test_cohere_query_generator.py │ └── test_query_generator_integration.py ├── record_encoder │ ├── test_anyscale_record_encoder.py │ ├── test_cohere_record_encoder.py │ ├── test_jina_record_encoder.py │ ├── 
test_octoai_record_encoder.py │ ├── test_openai_record_encoder.py │ └── test_sentence_transformers_encoder.py ├── reranker │ ├── __init__.py │ ├── test_cohere_reranker.py │ └── test_transparent_reranker.py ├── tokenizer │ ├── __init__.py │ └── test_cohere_api_tokenizer.py └── utils │ ├── __init__.py │ └── test_config.py ├── unit ├── __init__.py ├── chat_engine │ ├── __init__.py │ └── test_chat_engine.py ├── chunker │ ├── __init__.py │ ├── base_test_chunker.py │ ├── test_markdown_chunker.py │ ├── test_recursive_character_chunker.py │ ├── test_stub_chunker.py │ └── test_token_chunker.py ├── cli │ ├── test_data_loader.py │ └── test_non_schematic_data_loader.py ├── context_builder │ ├── __init__.py │ └── test_stuffing_context_builder.py ├── context_engine │ └── test_context_engine.py ├── history_pruner │ ├── test_raising_history_pruner.py │ └── test_recent_history_pruner.py ├── query_generators │ ├── __init__.py │ ├── test_function_calling_query_generator.py │ ├── test_instruction_query_generator.py │ └── test_last_message_query_generator.py ├── record_encoder │ ├── __init__.py │ ├── base_test_record_encoder.py │ ├── test_dense_record_encoder.py │ ├── test_hybrid_record_encoder.py │ ├── test_jina_record_encoder.py │ ├── test_sentence_transformers_encoder.py │ └── test_stub_record_encoder.py ├── stubs │ ├── __init__.py │ ├── stub_chunker.py │ ├── stub_dense_encoder.py │ ├── stub_record_encoder.py │ └── stub_tokenizer.py ├── tokenizer │ ├── __init__.py │ ├── base_test_tokenizer.py │ ├── test_cohere_hf_tokenizer.py │ ├── test_llama_tokenizer.py │ ├── test_openai_tokenizer.py │ ├── test_stub_tokenizer.py │ └── test_tokenizer_singleton.py └── utils │ ├── __init__.py │ ├── _stub_classes.py │ └── test_config.py └── util.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:1-3.9-bullseye" 7 | 8 | // Features to add to the dev container. More info: https://containers.dev/features. 9 | // "features": {}, 10 | 11 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 12 | // "forwardPorts": [], 13 | 14 | // Use 'postCreateCommand' to run commands after the container is created. 15 | // "postCreateCommand": "pip3 install --user -r requirements.txt", 16 | 17 | // Configure tool-specific properties. 18 | // "customizations": {}, 19 | 20 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
21 | // "remoteUser": "root" 22 | } 23 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | PINECONE_API_KEY="" 2 | OPENAI_API_KEY="" 3 | INDEX_NAME="" 4 | CANOPY_CONFIG_FILE="config/config.yaml" 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: Report a bug or an issue you've found 3 | title: "[Bug] " 4 | labels: ["bug", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to fill out this bug report! 10 | - type: checkboxes 11 | attributes: 12 | label: Is this a new bug? 13 | description: > 14 | In other words: Is this an error, flaw, failure or fault? Please search issues to see if someone has already reported the bug you encountered. 15 | options: 16 | - label: I believe this is a new bug 17 | required: true 18 | - label: I have searched the existing issues, and I could not find an existing issue for this bug 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: Current Behavior 23 | description: A concise description of what you're experiencing. 24 | validations: 25 | required: true 26 | - type: textarea 27 | attributes: 28 | label: Expected Behavior 29 | description: A concise description of what you expected to happen. 30 | validations: 31 | required: true 32 | - type: textarea 33 | attributes: 34 | label: Steps To Reproduce 35 | description: Steps to reproduce the behavior. 36 | placeholder: | 37 | 1. In this environment... 38 | 2. With this config... 39 | 3. Run '...' 40 | 4. See error... 41 | validations: 42 | required: true 43 | - type: textarea 44 | id: logs 45 | attributes: 46 | label: Relevant log output 47 | description: | 48 | If applicable, log output to help explain your problem. 49 | render: shell 50 | validations: 51 | required: false 52 | - type: textarea 53 | attributes: 54 | label: Environment 55 | description: | 56 | examples: 57 | - **OS**: Ubuntu 20.04 58 | - **Language version**: Python 3.9.12 (`python3 --version`) 59 | - **Canopy version**: 0.2 (`canopy --version`) 60 | value: | 61 | - **OS**: 62 | - **Language version**: 63 | - **Canopy version**: 64 | render: markdown 65 | validations: 66 | required: false 67 | - type: textarea 68 | attributes: 69 | label: Additional Context 70 | description: | 71 | Links? References? Anything that will give us more context about the issue you are encountering! 
72 | 73 | validations: 74 | required: false 75 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: 🤔 Ask a Question 4 | url: 'https://github.com/pinecone-io/canopy/discussions/new?category=q-a' 5 | about: Ask a question about how to use Canopy using GitHub discussions 6 | 7 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: ✨ Feature 2 | description: Propose a straightforward extension 3 | title: "[Feature] <title>" 4 | labels: ["enhancement", "triage"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Thanks for taking the time to fill out this feature request! 10 | - type: checkboxes 11 | attributes: 12 | label: Is this your first time submitting a feature request? 13 | description: > 14 | We want to make sure that features are distinct and discoverable, 15 | so that other members of the community can find them and offer their thoughts. 16 | 17 | Issues are the right place to request straightforward extensions of existing functionality. 18 | options: 19 | - label: I have searched the existing issues, and I could not find an existing issue for this feature 20 | required: true 21 | - label: I am requesting a straightforward extension of existing functionality 22 | - type: textarea 23 | attributes: 24 | label: Describe the feature 25 | description: A clear and concise description of what you want to happen. 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: Describe alternatives you've considered 31 | description: | 32 | A clear and concise description of any alternative solutions or features you've considered. 33 | validations: 34 | required: false 35 | - type: textarea 36 | attributes: 37 | label: Who will this benefit? 38 | description: | 39 | What kind of use case will this feature be useful for? Please be specific and provide examples, this will help us prioritize properly. 40 | validations: 41 | required: false 42 | - type: input 43 | attributes: 44 | label: Are you interested in contributing this feature? 45 | description: Let us know if you want to write some code, and how we can help. 46 | validations: 47 | required: false 48 | - type: textarea 49 | attributes: 50 | label: Anything else? 51 | description: | 52 | Links? References? Anything that will give us more context about the feature you are suggesting! 
53 | validations: 54 | required: false 55 | -------------------------------------------------------------------------------- /.github/actions/install-deps-and-canopy/action.yml: -------------------------------------------------------------------------------- 1 | name: Install dependencies and canopy 2 | description: "Installs Poetry, dependencies and optionally canopy library" 3 | inputs: 4 | python-version: 5 | description: "Python version" 6 | required: true 7 | default: "3.9" 8 | install-canopy: 9 | description: "Whether to install canopy library, or dependencies only" 10 | required: true 11 | default: "true" 12 | runs: 13 | using: "composite" 14 | steps: 15 | - name: Install Poetry 16 | uses: snok/install-poetry@v1 17 | with: 18 | version: 1.3.2 19 | virtualenvs-create: true 20 | virtualenvs-in-project: true 21 | installer-parallel: true 22 | #---------------------------------------------- 23 | # load cached venv if cache exists 24 | #---------------------------------------------- 25 | - name: Load cached venv 26 | id: cached-poetry-dependencies 27 | uses: actions/cache@v3 28 | with: 29 | path: | 30 | .venv 31 | poetry.lock 32 | key: venv-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('pyproject.toml') }} 33 | #---------------------------------------------- 34 | # install dependencies if cache does not exist 35 | #---------------------------------------------- 36 | - name: Install dependencies 37 | shell: bash 38 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 39 | run: make install-extras POETRY_INSTALL_ARGS="--no-interaction --no-root --with dev" 40 | - name: Install project 41 | if: ${{ inputs.install-canopy == 'true' }} 42 | shell: bash 43 | run: make install-extras POETRY_INSTALL_ARGS="--with dev --no-interaction" 44 | -------------------------------------------------------------------------------- /.github/workflows/build-push-image.yml: -------------------------------------------------------------------------------- 1 | name: Create and publish a Docker image 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version: 7 | description: 'Version to tag the image with' 8 | required: true 9 | type: string 10 | workflow_call: 11 | inputs: 12 | version: 13 | description: 'Version to tag the image with' 14 | required: true 15 | type: string 16 | push: 17 | branches: 18 | - 'main' 19 | tags: 20 | - 'v*' 21 | jobs: 22 | build-and-push-image: 23 | runs-on: ubuntu-latest 24 | permissions: 25 | contents: read 26 | packages: write 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v4 30 | - name: Set up QEMU 31 | uses: docker/setup-qemu-action@v3 32 | - name: Set up Docker Buildx 33 | uses: docker/setup-buildx-action@v3 34 | - name: Log in to the Container registry 35 | uses: docker/login-action@v3 36 | with: 37 | registry: ghcr.io 38 | username: ${{ github.actor }} 39 | password: ${{ secrets.GITHUB_TOKEN }} 40 | - name: Docker metadata 41 | id: meta 42 | uses: docker/metadata-action@v5 43 | with: 44 | images: | 45 | ghcr.io/${{ github.repository }} 46 | tags: | 47 | type=ref,event=branch,enable=${{ github.event_name == 'push' }} 48 | type=semver,pattern={{version}},enable=${{ github.event_name == 'push' }} 49 | type=raw,value=latest,enable=${{ github.event_name != 'push' }} 50 | type=raw,value=${{inputs.version}},enable=${{ github.event_name != 'push' }} 51 | - name: Create build args 52 | run: | 53 | export POETRY_INSTALL_ARGS="$(make print-var VAR=POETRY_DEFAULT_EXTRAS)" 54 | echo "POETRY_INSTALL_ARGS=$POETRY_INSTALL_ARGS" >> 
$GITHUB_OUTPUT 55 | id: build-args 56 | - name: Build and push 57 | uses: docker/build-push-action@v5 58 | with: 59 | context: . 60 | platforms: linux/amd64 61 | push: true 62 | build-args: | 63 | POETRY_INSTALL_ARGS=${{steps.build-args.outputs.POETRY_INSTALL_ARGS}} 64 | tags: ${{ steps.meta.outputs.tags }} 65 | labels: ${{ steps.meta.outputs.labels }} 66 | provenance: false 67 | cache-from: type=gha 68 | cache-to: type=gha,mode=max -------------------------------------------------------------------------------- /.github/workflows/pre-release-CI.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test installation 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | test-installation: 13 | name: Test on ${{ matrix.os }}-py${{ matrix.python-version }} 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ubuntu-latest, windows-latest, macos-latest] 18 | python-version: [3.9, '3.10', 3.11] 19 | defaults: 20 | run: 21 | shell: bash 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install Poetry 32 | uses: snok/install-poetry@v1 33 | with: 34 | version: 1.3.2 35 | virtualenvs-create: true 36 | virtualenvs-in-project: true 37 | installer-parallel: true 38 | 39 | - name: Download wheels 40 | uses: actions/download-artifact@v2 41 | with: 42 | name: wheels 43 | path: ./dist/ 44 | 45 | - name: Install dev dependencies 46 | run: | 47 | poetry install --no-root --only dev --no-interaction 48 | 49 | - name: Install the wheel 50 | run: | 51 | source $VENV 52 | ls -lah ./dist 53 | pip install ./dist/canopy_sdk*.whl 54 | 55 | - name: Run unit tests 56 | run: | 57 | source $VENV 58 | pytest --html=report.html --self-contained-html tests/unit 59 | 60 | - name: Upload pytest reports 61 | if: failure() 62 | uses: actions/upload-artifact@v3 63 | with: 64 | name: pytest-report-${{ matrix.os }}-py${{ matrix.python-version }} 65 | path: .pytest_cache 66 | 67 | 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | # Mac OS 163 | **/.DS_Store 164 | 165 | datafiles/* 166 | canopy-api-docs.html 167 | .vscode/ 168 | *.jsonl 169 | -------------------------------------------------------------------------------- /.readme-content/class_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/class_architecture.png -------------------------------------------------------------------------------- /.readme-content/new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/new.gif -------------------------------------------------------------------------------- /.readme-content/rag_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/rag_flow.png -------------------------------------------------------------------------------- /.readme-content/resin-chat-no-rag.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/resin-chat-no-rag.gif -------------------------------------------------------------------------------- /.readme-content/sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/.readme-content/sketch.png -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | TEST_WORKER_COUNT = 8 2 | 3 | POETRY_DEFAULT_EXTRAS = -E cohere -E transformers -E grpc 4 | POETRY_INSTALL_ARGS = 5 | 6 | REPOSITORY = ghcr.io/pinecone-io/canopy 7 | IMAGE_TAG = $(shell poetry version -s) 8 | 9 | CONTAINER_PORT = 8000 10 | CONTAINER_ENV_FILE = .env 11 | CONTAINER_BUILD_DIR = . 12 | CONTAINER_BUILD_PLATFORM = linux/amd64 13 | CONTAINER_SYSTEM_BUILD_ARGS = --progress plain --platform $(CONTAINER_BUILD_PLATFORM) --build-arg PORT=$(CONTAINER_PORT) --build-arg POETRY_INSTALL_ARGS="$(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS)" 14 | CONTAINER_BUILD_ARGS = 15 | 16 | # Only add the env file if it exists 17 | CONTAINER_SYSTEM_RUN_ARGS = --platform linux/amd64 -p $(CONTAINER_PORT):$(CONTAINER_PORT) $(shell [ -e "$(CONTAINER_ENV_FILE)" ] && echo "--env-file $(CONTAINER_ENV_FILE)") 18 | CONTAINER_RUN_ARGS = 19 | 20 | 21 | .PHONY: lint static install install-extras install-all-extras test test-unit test-system test-e2e docker-build docker-build-dev docker-run docker-run-dev print-var help 22 | 23 | lint: 24 | poetry run flake8 . 
25 | 26 | static: 27 | poetry run mypy src 28 | 29 | install: 30 | poetry install $(POETRY_INSTALL_ARGS) 31 | 32 | install-extras: 33 | poetry install $(POETRY_DEFAULT_EXTRAS) $(POETRY_INSTALL_ARGS) 34 | 35 | install-all-extras: 36 | poetry install --all-extras $(POETRY_INSTALL_ARGS) 37 | 38 | test: 39 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope 40 | 41 | test-unit: 42 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/unit 43 | 44 | test-system: 45 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/system 46 | 47 | test-e2e: 48 | poetry run pytest -n $(TEST_WORKER_COUNT) --dist loadscope tests/e2e 49 | 50 | docker-build: 51 | @echo "Building Docker image..." 52 | docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY):$(IMAGE_TAG) $(CONTAINER_BUILD_DIR) 53 | @echo "Docker build complete." 54 | 55 | docker-build-dev: 56 | @echo "Building Docker image for development..." 57 | docker build $(CONTAINER_SYSTEM_BUILD_ARGS) $(CONTAINER_BUILD_ARGS) -t $(REPOSITORY)-dev:$(IMAGE_TAG) --target=development $(CONTAINER_BUILD_DIR) 58 | @echo "Development Docker build complete." 59 | 60 | docker-run: 61 | docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) $(REPOSITORY):$(IMAGE_TAG) 62 | 63 | docker-run-dev: 64 | docker run $(CONTAINER_SYSTEM_RUN_ARGS) $(CONTAINER_RUN_ARGS) -it $(REPOSITORY)-dev:$(IMAGE_TAG) 65 | 66 | print-var: 67 | @echo "$($(VAR))" 68 | 69 | help: 70 | @echo "Available targets:" 71 | @echo "" 72 | @echo " -- DEV -- " 73 | @echo " make install - Install only the required dependencies without any extras." 74 | @echo " make install-extras - Install the dependencies with the default extras." 75 | @echo " make install-all-extras - Install the dependencies with all extras." 76 | @echo " make lint - Lint the code." 77 | @echo " make static - Run static type checks." 78 | @echo " make test - Test the code." 79 | @echo " make test-unit - Run unit tests." 80 | @echo " make test-system - Run system tests." 81 | @echo " make test-e2e - Run e2e tests." 82 | @echo "" 83 | @echo " -- DOCKER -- " 84 | @echo " make docker-build - Build the Docker image." 85 | @echo " make docker-build-dev - Build the Docker image for development." 86 | @echo " make docker-run - Run the Docker image." 87 | @echo " make docker-run-dev - Run the Docker image for development." 88 | @echo "" 89 | @echo " -- MISC -- " 90 | @echo " make print-var VAR=<variable> - Print the value of a variable." 
91 | 92 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "canopy-sdk" 3 | version = "0.9.0" 4 | description = "Retrieval Augmented Generation (RAG) framework and context engine powered by Pinecone" 5 | authors = ["Relevance Team <relevance@pinecone.io>"] 6 | readme = "README.md" 7 | license = "Apache-2.0" 8 | packages = [{include = "canopy", from = "src"}, 9 | {include = "canopy_cli", from = "src"}, 10 | {include = "canopy_server", from = "src"},] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.9,<3.13" 14 | python-dotenv = "^1.0.0" 15 | openai = "^1.2.3" 16 | tiktoken = "^0.3.3" 17 | pydantic = "^2.0.0" 18 | pandas-stubs = "^2.0.3.230814" 19 | fastapi = ">=0.93.0, <1.0.0" 20 | uvicorn = ">=0.20.0, <1.0.0" 21 | tenacity = "^8.2.1" 22 | sse-starlette = "^1.6.5" 23 | types-tqdm = "^4.61.0" 24 | tqdm = "^4.66.1" 25 | gunicorn = "^21.2.0" 26 | types-pyyaml = "^6.0.12.12" 27 | jsonschema = "^4.2.0" 28 | types-jsonschema = "^4.2.0" 29 | prompt-toolkit = "^3.0.39" 30 | tokenizers = "^0.15.0" 31 | transformers = {version = "^4.35.2", optional = true} 32 | sentencepiece = "^0.1.99" 33 | pandas = "2.0.0" 34 | pyarrow = "^14.0.1" 35 | qdrant-client = {version = "^1.8.0", optional = true} 36 | cohere = { version = "^4.37", optional = true } 37 | 38 | 39 | pinecone-text = "^0.8.0" 40 | # Extra: torch (Relies on pinecone-text[dense]) 41 | # Dependencies here should be synced with pinecone-text's pyproject.toml 42 | # See: https://github.com/pinecone-io/pinecone-text/blob/0eb00a202f5c9bc8cc48c8b7536fcbabf95f096e/pyproject.toml#L30 43 | torch = { version = ">=1.13.1", optional = true } 44 | sentence-transformers = { version = ">=2.0.0", optional = true } 45 | 46 | 47 | pinecone-client = "^3.0.0" 48 | # Extra: grpc (Relies on pinecone-client[grpc]) 49 | # Dependencies here should be synced with pinecone-python-client's pyproject.toml 50 | # See: https://github.com/pinecone-io/pinecone-python-client/blob/886f932b66521a6ab5b1e076f6a53ba2f16eb41b/pyproject.toml#L94 51 | grpcio = { version = ">=1.44.0", optional = true } 52 | grpc-gateway-protoc-gen-openapiv2 = { version = "0.1.0", optional = true } 53 | googleapis-common-protos = { version = ">=1.53.0", optional = true } 54 | lz4 = { version = ">=3.1.3", optional = true } 55 | protobuf = { version = "~=3.20.0", optional = true } 56 | 57 | 58 | 59 | [tool.poetry.extras] 60 | cohere = ["cohere"] 61 | torch = ["torch", "sentence-transformers"] 62 | transformers = ["transformers"] 63 | grpc = ["grpcio", "grpc-gateway-protoc-gen-openapiv2", "googleapis-common-protos", "lz4", "protobuf"] 64 | qdrant = ["qdrant-client"] 65 | 66 | 67 | [tool.poetry.group.dev.dependencies] 68 | pytest = "^7.3.2" 69 | jupyter = "^1.0.0" 70 | mypy = "^1.4.1" 71 | flake8 = "^6.1.0" 72 | pytest-html = "^4.1.0" 73 | flake8-pyproject = "^1.2.3" 74 | asyncio = "^3.4.3" 75 | pytest-asyncio = "^0.14.0" 76 | pytest-mock = "^3.6.1" 77 | pytest-xdist = "^3.3.1" 78 | types-requests = "^2.31.0.2" 79 | httpx = "^0.25.0" 80 | pydoclint = "^0.3.8" 81 | pytest-dotenv = "^0.5.2" 82 | 83 | [build-system] 84 | requires = ["poetry-core"] 85 | build-backend = "poetry.core.masonry.api" 86 | 87 | 88 | [tool.mypy] 89 | allow_redefinition = true 90 | exclude = ['tests', '.venv'] 91 | 92 | [[tool.mypy.overrides]] 93 | module = [ 94 | 'pinecone_text.*', 95 | 'pinecone_datasets', 96 | 'pinecone', 97 | 'transformers.*', 98 | 'tokenizers.*', 99 | 
'cohere.*', 100 | 'pinecone.grpc', 101 | 'huggingface_hub.utils', 102 | 'qdrant_client.*', 103 | 'grpc.*' 104 | ] 105 | ignore_missing_imports = true 106 | 107 | 108 | [tool.flake8] 109 | per-file-ignores = [ 110 | '*/__init__.py:F401', 111 | ] 112 | exclude = ['.venv'] 113 | max-line-length = 88 114 | 115 | # PyDocLint configuration 116 | style = 'google' 117 | arg-type-hints-in-docstring = false 118 | require-return-section-when-returning-nothing = false 119 | allow-init-docstring = true 120 | check-return-types = false 121 | skip-checking-raises = true 122 | 123 | [tool.poetry.scripts] 124 | canopy = "canopy_cli.cli:cli" 125 | 126 | [tool.pytest.ini_options] 127 | log_cli = true 128 | -------------------------------------------------------------------------------- /scripts/cleanup_indexes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from tests.util import cleanup_indexes 4 | 5 | 6 | def main(): 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | if len(sys.argv) != 2: 11 | logger.info("Usage: python scripts/cleanup_indexes.py <testrun_uid>") 12 | sys.exit(1) 13 | 14 | testrun_uid = sys.argv[1] 15 | if testrun_uid: 16 | logger.info(f"Cleaning up indexes for testrun_uid '{testrun_uid}'") 17 | cleanup_indexes(testrun_uid) 18 | else: 19 | logger.info("Passed testrun_uid is empty.") 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /src/canopy/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import warnings 3 | import logging 4 | import os 5 | from typing import List 6 | 7 | # Taken from https://stackoverflow.com/a/67097076 8 | __version__ = importlib.metadata.version("canopy-sdk") 9 | 10 | 11 | IGNORED_WARNINGS: List[str] = [ 12 | ] 13 | 14 | IGNORED_WARNING_IN_MODULES = [ 15 | "transformers", 16 | ] 17 | 18 | for warning in IGNORED_WARNINGS: 19 | warnings.filterwarnings("ignore", message=warning) 20 | for module in IGNORED_WARNING_IN_MODULES: 21 | warnings.filterwarnings("ignore", module=module) 22 | logging.getLogger(module).setLevel(logging.ERROR) 23 | 24 | # Apparently, `transformers` has its own logging system, and needs to be silenced separately # noqa: E501 25 | os.environ["TRANSFORMERS_VERBOSITY"] = "error" 26 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .chat_engine import ChatEngine 2 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class InvalidRequestError(Exception): 4 | pass 5 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/__init__.py: -------------------------------------------------------------------------------- 1 | from .recent import RecentHistoryPruner 2 | from .raising import RaisingHistoryPruner 3 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | 4 | 
from canopy.tokenizer import Tokenizer 5 | from canopy.models.data_models import Messages, SystemMessage, Context 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class HistoryPruner(ABC, ConfigurableMixin): 10 | 11 | def __init__(self): 12 | self._tokenizer = Tokenizer() 13 | 14 | @abstractmethod 15 | def build(self, 16 | chat_history: Messages, 17 | max_tokens: int, 18 | system_prompt: Optional[str] = None, 19 | context: Optional[Context] = None, 20 | ) -> Messages: 21 | raise NotImplementedError 22 | 23 | async def abuild(self, 24 | chat_history: Messages, 25 | max_tokens: int) -> Messages: 26 | raise NotImplementedError() 27 | 28 | def _max_tokens_history(self, 29 | max_tokens: int, 30 | system_prompt: Optional[str] = None, 31 | context: Optional[Context] = None, ) -> int: 32 | if system_prompt is not None: 33 | max_tokens -= self._tokenizer.messages_token_count( 34 | [SystemMessage(content=system_prompt)] 35 | ) 36 | 37 | if context is not None: 38 | max_tokens -= context.num_tokens 39 | 40 | return max_tokens 41 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/raising.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from canopy.chat_engine.history_pruner.base import HistoryPruner 4 | from canopy.models.data_models import Messages, Context 5 | 6 | 7 | class RaisingHistoryPruner(HistoryPruner): 8 | 9 | def build(self, 10 | chat_history: Messages, 11 | max_tokens: int, 12 | system_prompt: Optional[str] = None, 13 | context: Optional[Context] = None, ) -> Messages: 14 | max_tokens = self._max_tokens_history(max_tokens, 15 | system_prompt, 16 | context) 17 | token_count = self._tokenizer.messages_token_count(chat_history) 18 | if token_count > max_tokens: 19 | raise ValueError(f"The history require {token_count} tokens, " 20 | f"which exceeds the calculated limit for history " 21 | f"of {max_tokens} tokens left for" 22 | f" history out of {max_tokens} tokens" 23 | f" allowed in context window.") 24 | return chat_history 25 | 26 | async def abuild(self, 27 | chat_history: Messages, 28 | max_tokens: int) -> Messages: 29 | raise NotImplementedError() 30 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/history_pruner/recent.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from canopy.chat_engine.history_pruner.base import HistoryPruner 4 | from canopy.models.data_models import Messages, Context 5 | 6 | 7 | class RecentHistoryPruner(HistoryPruner): 8 | 9 | def __init__(self, 10 | min_history_messages: int = 1): 11 | super().__init__() 12 | self._min_history_messages = min_history_messages 13 | 14 | def build(self, 15 | chat_history: Messages, 16 | max_tokens: int, 17 | system_prompt: Optional[str] = None, 18 | context: Optional[Context] = None, 19 | ) -> Messages: 20 | max_tokens = self._max_tokens_history(max_tokens, 21 | system_prompt, 22 | context) 23 | token_count = self._tokenizer.messages_token_count(chat_history) 24 | if token_count < max_tokens: 25 | return chat_history 26 | 27 | truncated_history = chat_history[-self._min_history_messages:] 28 | token_count = self._tokenizer.messages_token_count(truncated_history) 29 | if token_count > max_tokens: 30 | raise ValueError(f"The {self._min_history_messages} most recent messages in" 31 | f" history require {token_count} tokens, which exceeds 
the" 32 | f" calculated limit for history of {max_tokens}" 33 | f" tokens out of total {max_tokens} allowed" 34 | f" in context window.") 35 | 36 | for message in reversed(chat_history[:-self._min_history_messages]): 37 | token_count = self._tokenizer.messages_token_count( 38 | truncated_history + [message] 39 | ) 40 | 41 | # If the message can fit into the remaining tokens, add it 42 | if token_count > max_tokens: 43 | break 44 | 45 | truncated_history.insert(0, message) 46 | 47 | return truncated_history 48 | 49 | async def abuild(self, 50 | chat_history: Messages, 51 | max_tokens: int) -> Messages: 52 | raise NotImplementedError() 53 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import QueryGenerator 2 | from .function_calling import FunctionCallingQueryGenerator 3 | from .last_message import LastMessageQueryGenerator 4 | from .instruction import InstructionQueryGenerator 5 | from .cohere import CohereQueryGenerator 6 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.models.data_models import Messages, Query 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class QueryGenerator(ABC, ConfigurableMixin): 9 | @abstractmethod 10 | def generate(self, 11 | messages: Messages, 12 | max_prompt_tokens: int, 13 | ) -> List[Query]: 14 | pass 15 | 16 | @abstractmethod 17 | async def agenerate(self, 18 | messages: Messages, 19 | max_prompt_tokens: int, 20 | ) -> List[Query]: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/cohere.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, cast 2 | 3 | from canopy.chat_engine.query_generator import QueryGenerator 4 | from canopy.chat_engine.history_pruner.raising import RaisingHistoryPruner 5 | from canopy.llm import BaseLLM, CohereLLM 6 | from canopy.models.data_models import Messages, Query 7 | 8 | 9 | class CohereQueryGenerator(QueryGenerator): 10 | """ 11 | Query generator for LLM clients that have a built-in feature to 12 | generate search queries from chat messages. 
13 | """ 14 | _DEFAULT_COMPONENTS = { 15 | "llm": CohereLLM, 16 | } 17 | 18 | def __init__(self, 19 | *, 20 | llm: Optional[BaseLLM] = None): 21 | self._llm = llm or self._DEFAULT_COMPONENTS["llm"]() 22 | 23 | if not isinstance(self._llm, CohereLLM): 24 | raise NotImplementedError( 25 | "CohereQueryGenerator only compatible with CohereLLM" 26 | ) 27 | 28 | self._history_pruner = RaisingHistoryPruner() 29 | 30 | def generate(self, 31 | messages: Messages, 32 | max_prompt_tokens: int) -> List[Query]: 33 | messages = self._history_pruner.build(chat_history=messages, 34 | max_tokens=max_prompt_tokens) 35 | llm = cast(CohereLLM, self._llm) 36 | queries = llm.generate_search_queries(messages) 37 | return [Query(text=query) for query in queries] 38 | 39 | async def agenerate(self, 40 | messages: Messages, 41 | max_prompt_tokens: int) -> List[Query]: 42 | raise NotImplementedError 43 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/function_calling.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from canopy.chat_engine.history_pruner import RaisingHistoryPruner 4 | from canopy.chat_engine.query_generator import QueryGenerator 5 | from canopy.llm import BaseLLM, OpenAILLM 6 | from canopy.llm.models import (Function, FunctionParameters, 7 | FunctionArrayProperty) 8 | from canopy.models.data_models import Messages, Query 9 | 10 | DEFAULT_SYSTEM_PROMPT = """Your task is to formulate search queries for a search engine, to assist in responding to the user's question. 11 | You should break down complex questions into sub-queries if needed. 12 | """ # noqa: E501 13 | 14 | DEFAULT_FUNCTION_DESCRIPTION = """Query search engine for relevant information""" 15 | 16 | 17 | class FunctionCallingQueryGenerator(QueryGenerator): 18 | 19 | _DEFAULT_COMPONENTS = { 20 | "llm": OpenAILLM, 21 | } 22 | 23 | def __init__(self, 24 | *, 25 | llm: Optional[BaseLLM] = None, 26 | prompt: Optional[str] = None, 27 | function_description: Optional[str] = None): 28 | self._llm = llm or self._DEFAULT_COMPONENTS["llm"]() 29 | self._system_prompt = prompt or DEFAULT_SYSTEM_PROMPT 30 | self._function_description = \ 31 | function_description or DEFAULT_FUNCTION_DESCRIPTION 32 | self._history_pruner = RaisingHistoryPruner() 33 | 34 | def generate(self, 35 | messages: Messages, 36 | max_prompt_tokens: int) -> List[Query]: 37 | messages = self._history_pruner.build(system_prompt=self._system_prompt, 38 | chat_history=messages, 39 | max_tokens=max_prompt_tokens) 40 | try: 41 | arguments = self._llm.enforced_function_call( 42 | system_prompt=self._system_prompt, 43 | chat_history=messages, 44 | function=self._function 45 | ) 46 | except NotImplementedError as e: 47 | raise RuntimeError( 48 | "FunctionCallingQueryGenerator requires an LLM that supports " 49 | "function calling. Please provide a different LLM, " 50 | "or alternatively select a different QueryGenerator class. 
" 51 | f"Received the following error from LLM:\n{e}" 52 | ) from e 53 | 54 | return [Query(text=q) 55 | for q in arguments["queries"]] 56 | 57 | async def agenerate(self, 58 | messages: Messages, 59 | max_prompt_tokens: int) -> List[Query]: 60 | raise NotImplementedError 61 | 62 | @property 63 | def _function(self) -> Function: 64 | return Function( 65 | name="query_knowledgebase", 66 | description=self._function_description, 67 | parameters=FunctionParameters( 68 | required_properties=[ 69 | FunctionArrayProperty( 70 | name="queries", 71 | items_type="string", 72 | description='List of queries to send to the search engine.', 73 | ), 74 | ] 75 | ), 76 | ) 77 | -------------------------------------------------------------------------------- /src/canopy/chat_engine/query_generator/last_message.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.chat_engine.query_generator import QueryGenerator 4 | from canopy.models.data_models import Messages, Query, Role 5 | 6 | 7 | class LastMessageQueryGenerator(QueryGenerator): 8 | """ 9 | Returns the last message as a query without running any LLMs. This can be 10 | considered as the most basic query generation. Please use other query generators 11 | for more accurate results. 12 | """ 13 | 14 | def generate(self, 15 | messages: Messages, 16 | max_prompt_tokens: int) -> List[Query]: 17 | """ 18 | max_prompt_token is dismissed since we do not consume any token for 19 | generating the queries. 20 | """ 21 | 22 | if len(messages) == 0: 23 | raise ValueError("Passed chat history does not contain any messages. " 24 | "Please include at least one message in the history.") 25 | 26 | last_message = messages[-1] 27 | 28 | if last_message.role != Role.USER: 29 | raise ValueError(f"Expected a UserMessage, got {type(last_message)}.") 30 | 31 | return [Query(text=last_message.content)] 32 | 33 | async def agenerate(self, 34 | messages: Messages, 35 | max_prompt_tokens: int) -> List[Query]: 36 | return self.generate(messages, max_prompt_tokens) 37 | -------------------------------------------------------------------------------- /src/canopy/config_templates/anyscale.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Canopy Server 3 | # =========================================================== 4 | tokenizer: 5 | # ------------------------------------------------------------------------------------------- 6 | # Tokenizer configuration 7 | # Use LLamaTokenizer from HuggingFace with the relevant OSS model (e.g. LLama2) 8 | # ------------------------------------------------------------------------------------------- 9 | type: LlamaTokenizer # Options: [OpenAITokenizer, LlamaTokenizer] 10 | params: 11 | model_name: hf-internal-testing/llama-tokenizer 12 | 13 | chat_engine: 14 | # ------------------------------------------------------------------------------------------- 15 | # Chat engine configuration 16 | # Use Anyscale Endpoint as the open source LLM provider 17 | # You can find the list of supported LLM at https://docs.endpoints.anyscale.com/category/supported-models 18 | # ------------------------------------------------------------------------------------------- 19 | params: 20 | max_prompt_tokens: 2048 # The maximum number of tokens to use for input prompt to the LLM. 
21 | llm: &llm 22 | type: AnyscaleLLM 23 | params: 24 | model_name: meta-llama/Llama-2-7b-chat-hf # The name of the model to use. 25 | 26 | query_builder: 27 | type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 28 | llm: 29 | type: AnyscaleLLM 30 | params: 31 | model_name: mistralai/Mistral-7B-Instruct-v0.1 32 | 33 | context_engine: 34 | # ------------------------------------------------------------------------------------------------------------- 35 | # ContextEngine configuration 36 | # ------------------------------------------------------------------------------------------------------------- 37 | knowledge_base: 38 | # ----------------------------------------------------------------------------------------------------------- 39 | # KnowledgeBase configuration 40 | # ----------------------------------------------------------------------------------------------------------- 41 | record_encoder: 42 | # -------------------------------------------------------------------------- 43 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 44 | # Use Anyscale's Embedding endpoint for dense encoding 45 | # -------------------------------------------------------------------------- 46 | type: AnyscaleRecordEncoder 47 | params: 48 | model_name: # The name of the model to use for encoding 49 | thenlper/gte-large 50 | batch_size: 100 # The number of document chunks to encode in each call to the encoding model 51 | -------------------------------------------------------------------------------- /src/canopy/config_templates/azure.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Azure OpenAI 3 | # =========================================================== 4 | 5 | 6 | query_builder_prompt: &query_builder_prompt | 7 | Your task is to formulate search queries for a search engine, to assist in responding to the user's question. 8 | You should break down complex questions into sub-queries if needed. 9 | 10 | 11 | tokenizer: 12 | type: OpenAITokenizer # Options: [OpenAITokenizer] 13 | params: 14 | model_name: gpt-3.5-turbo # Configure the tokenizer that matches the OpenAI model in your deployment 15 | 16 | 17 | chat_engine: 18 | 19 | llm: &llm 20 | # ------------------------------------------------------------------------------------------------------------- 21 | # LLM configuration 22 | # Configuration of the LLM (Large Language Model) 23 | # ------------------------------------------------------------------------------------------------------------- 24 | type: AzureOpenAILLM # Options: [OpenAILLM, AzureOpenAILLM] 25 | params: 26 | model_name: your-deployment-name # Specify the name of the LLM deployment to use. 27 | api_version: 2023-12-01-preview # Specify the API version to use. 
28 | 29 | query_builder: 30 | # ------------------------------------------------------------------------------------------------------------- 31 | # LLM configuration 32 | # Configuration of the LLM (Large Language Model) 33 | # ------------------------------------------------------------------------------------------------------------- 34 | type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 35 | params: 36 | prompt: *query_builder_prompt # The query builder's system prompt for calling the LLM 37 | function_description: # A function description passed to the LLM's `function_calling` API 38 | Query search engine for relevant information 39 | 40 | llm: # The LLM that the query builder will use to generate queries. Leave `*llm` to use the chat engine's LLM 41 | <<: *llm 42 | 43 | context_engine: 44 | 45 | knowledge_base: 46 | 47 | record_encoder: 48 | # -------------------------------------------------------------------------- 49 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 50 | # The record encoder is responsible for encoding document chunks to a vector representation 51 | # -------------------------------------------------------------------------- 52 | type: AzureOpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder] 53 | params: 54 | model_name: # Specify the name of the embedding deployment to use. 55 | your-embedding-deployment-name 56 | batch_size: 400 # The number of document chunks to encode in each call to the encoding model -------------------------------------------------------------------------------- /src/canopy/config_templates/octoai.yaml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # Configuration file for Canopy Server 3 | # =========================================================== 4 | tokenizer: 5 | # ------------------------------------------------------------------------------------------- 6 | # Tokenizer configuration 7 | # Use LLamaTokenizer from HuggingFace with the relevant OSS model (e.g. LLama2) 8 | # ------------------------------------------------------------------------------------------- 9 | type: LlamaTokenizer # Options: [OpenAITokenizer, LlamaTokenizer] 10 | params: 11 | model_name: hf-internal-testing/llama-tokenizer 12 | 13 | chat_engine: 14 | # ------------------------------------------------------------------------------------------- 15 | # Chat engine configuration 16 | # Use OctoAI as the open source LLM provider 17 | # You can find the list of supported LLMs at https://octo.ai/docs/text-gen-solution/rest-api 18 | # ------------------------------------------------------------------------------------------- 19 | params: 20 | max_prompt_tokens: 2048 # The maximum number of tokens to use for input prompt to the LLM. 21 | llm: &llm 22 | type: OctoAILLM 23 | params: 24 | model_name: mistral-7b-instruct-fp16 # The name of the model to use. 
25 | 26 | # query_builder: 27 | # type: FunctionCallingQueryGenerator # Options: [FunctionCallingQueryGenerator, LastMessageQueryGenerator, InstructionQueryGenerator] 28 | # llm: 29 | # type: OctoAILLM 30 | # params: 31 | # model_name: mistral-7b-instruct-fp16 32 | 33 | context_engine: 34 | # ------------------------------------------------------------------------------------------------------------- 35 | # ContextEngine configuration 36 | # ------------------------------------------------------------------------------------------------------------- 37 | knowledge_base: 38 | # ----------------------------------------------------------------------------------------------------------- 39 | # KnowledgeBase configuration 40 | # ----------------------------------------------------------------------------------------------------------- 41 | record_encoder: 42 | # -------------------------------------------------------------------------- 43 | # Configuration for the RecordEncoder subcomponent of the knowledge base. 44 | # Use OctoAI's Embedding endpoint for dense encoding 45 | # -------------------------------------------------------------------------- 46 | type: OctoAIRecordEncoder 47 | params: 48 | model_name: # The name of the model to use for encoding 49 | thenlper/gte-large 50 | batch_size: 2048 # The number of document chunks to encode in each call to the encoding model 51 | -------------------------------------------------------------------------------- /src/canopy/context_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .context_engine import ContextEngine 2 | -------------------------------------------------------------------------------- /src/canopy/context_engine/context_builder/__init__.py: -------------------------------------------------------------------------------- 1 | from .stuffing import StuffingContextBuilder 2 | from .base import ContextBuilder 3 | -------------------------------------------------------------------------------- /src/canopy/context_engine/context_builder/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import QueryResult 5 | from canopy.models.data_models import Context 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class ContextBuilder(ABC, ConfigurableMixin): 10 | """ 11 | BaseContextBuilder is an abstract class that defines the interface for a context 12 | builder. 
13 | """ 14 | 15 | @abstractmethod 16 | def build(self, 17 | query_results: List[QueryResult], 18 | max_context_tokens: int, ) -> Context: 19 | pass 20 | 21 | @abstractmethod 22 | async def abuild(self, 23 | query_results: List[QueryResult], 24 | max_context_tokens: int, ) -> Context: 25 | pass 26 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/__init__.py: -------------------------------------------------------------------------------- 1 | from .knowledge_base import list_canopy_indexes 2 | from .knowledge_base import KnowledgeBase 3 | from .qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 4 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Optional 3 | 4 | from canopy.knowledge_base.models import QueryResult 5 | from canopy.models.data_models import Query, Document 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class BaseKnowledgeBase(ABC, ConfigurableMixin): 10 | """ 11 | KnowledgeBase is an abstract class that defines the interface for a knowledge base. 12 | """ 13 | 14 | @abstractmethod 15 | def query(self, 16 | queries: List[Query], 17 | global_metadata_filter: Optional[dict] = None, 18 | namespace: Optional[str] = None 19 | ) -> List[QueryResult]: 20 | pass 21 | 22 | @abstractmethod 23 | def upsert(self, 24 | documents: List[Document], 25 | namespace: str = "", ) -> None: 26 | pass 27 | 28 | # TODO: Do we want delete by metadata? 29 | @abstractmethod 30 | def delete(self, 31 | document_ids: List[str], 32 | namespace: str = "") -> None: 33 | pass 34 | 35 | @abstractmethod 36 | def verify_index_connection(self) -> None: 37 | pass 38 | 39 | @abstractmethod 40 | async def aquery(self, 41 | queries: List[Query], 42 | global_metadata_filter: Optional[dict] = None, 43 | namespace: Optional[str] = None 44 | ) -> List[QueryResult]: 45 | pass 46 | 47 | @abstractmethod 48 | async def aupsert(self, 49 | documents: List[Document], 50 | namespace: str = "", 51 | ) -> None: 52 | pass 53 | 54 | @abstractmethod 55 | async def adelete(self, 56 | document_ids: List[str], 57 | namespace: str = "") -> None: 58 | pass 59 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | from .token_chunker import TokenChunker 2 | from .markdown import MarkdownChunker 3 | from .base import Chunker 4 | 5 | __ALL__ = [ 6 | "MarkdownChunker", 7 | "TokenChunker", 8 | "Chunker", 9 | ] 10 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import KBDocChunk 5 | from canopy.models.data_models import Document 6 | from canopy.utils.config import ConfigurableMixin 7 | 8 | 9 | class Chunker(ABC, ConfigurableMixin): 10 | """ 11 | Base class for chunkers. Chunkers take a document (id, text, ...) 12 | and return a list of KBDocChunks (id, text, document_id, ...) 
13 | Chunker is an abstract class that must be subclassed to be used. 14 | It also extends ConfigurableMixin, which means that every subclass of 15 | Chunker can be referenced by a name and configured in a config file. 16 | """ 17 | 18 | def chunk_documents(self, documents: List[Document]) -> List[KBDocChunk]: 19 | """ 20 | chunk_documents takes a list of documents and returns a list of KBDocChunks. 21 | This method is just a wrapper around chunk_single_document that can be 22 | used to chunk a list of documents. 23 | 24 | Args: 25 | documents: list of documents 26 | 27 | Returns: 28 | chunks: list of chunks of type KBDocChunk 29 | """ 30 | chunks: List[KBDocChunk] = [] 31 | for doc in documents: 32 | chunks.extend(self.chunk_single_document(doc)) 33 | return chunks 34 | 35 | async def achunk_documents(self, documents: List[Document]) -> List[KBDocChunk]: 36 | chunks: List[KBDocChunk] = [] 37 | for doc in documents: 38 | chunks.extend(await self.achunk_single_document(doc)) 39 | return chunks 40 | 41 | @abstractmethod 42 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 43 | """ 44 | chunk_single_document takes a document and returns a 45 | list of KBDocChunks. This is the main method 46 | that must be implemented by every subclass of Chunker. 47 | 48 | Args: 49 | document: the single document to be chunked 50 | 51 | Returns: 52 | chunks: list of KBDocChunk chunks 53 | """ 54 | pass 55 | 56 | @abstractmethod 57 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 58 | raise NotImplementedError() 59 | 60 | def generate_chunk_id(self, document_id: str, chunk_index: int) -> str: 61 | return f"{document_id}_{chunk_index}" 62 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/markdown.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .langchain_text_splitter import Language, RecursiveCharacterTextSplitter 4 | from .recursive_character import RecursiveCharacterChunker 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.models.data_models import Document 7 | 8 | 9 | class MarkdownChunker(RecursiveCharacterChunker): 10 | """ 11 | MarkdownChunker is a subclass of RecursiveCharacterChunker that is configured 12 | to chunk markdown documents. It uses RecursiveCharacterTextSplitter to split 13 | the text of the document into chunks, by providing the separators for markdown documents 14 | (also from LangChainTextSplitter, with modifications). 15 | """ # noqa: E501 16 | 17 | def __init__(self, 18 | chunk_size: int = 256, 19 | chunk_overlap: int = 0, 20 | keep_separator: bool = True 21 | ): 22 | """ 23 | Initializes RecursiveCharacterChunker with the separators for markdown documents. 24 | 25 | Args: 26 | chunk_size: size of the chunks. Defaults to 256 tokens. 27 | chunk_overlap: overlap between chunks. Defaults to 0. 28 | keep_separator: whether to keep the separator in the chunk. Defaults to True.
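        A usage sketch (illustrative only; it assumes the global Tokenizer singleton was initialized beforehand, since chunk sizes are measured in tokens):
            >>> chunker = MarkdownChunker(chunk_size=256, chunk_overlap=0)
            >>> chunks = chunker.chunk_documents(documents)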
29 | 30 | """ # noqa: E501 31 | separators = RecursiveCharacterTextSplitter.get_separators_for_language( 32 | Language.MARKDOWN 33 | ) 34 | super().__init__(chunk_size=chunk_size, 35 | chunk_overlap=chunk_overlap, 36 | separators=separators, 37 | keep_separator=keep_separator) 38 | 39 | async def achunk_single_document(self, 40 | document: Document) -> List[KBDocChunk]: 41 | raise NotImplementedError() 42 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/recursive_character.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List, Optional 3 | 4 | from .langchain_text_splitter import RecursiveCharacterTextSplitter 5 | 6 | from canopy.knowledge_base.chunker.base import Chunker 7 | from canopy.knowledge_base.models import KBDocChunk 8 | from canopy.tokenizer import Tokenizer 9 | from canopy.models.data_models import Document 10 | 11 | 12 | class RecursiveCharacterChunker(Chunker): 13 | """ 14 | A chunker that splits a document into chunks of a given size, using a recursive character splitter. 15 | A RecursiveCharacterChunker is a derived class of Chunker, which means that it can be referenced by a name 16 | and configured in a config file. 17 | """ # noqa: E501 18 | 19 | def __init__(self, 20 | chunk_size: int = 256, 21 | chunk_overlap: int = 0, 22 | separators: Optional[List[str]] = None, 23 | keep_separator: bool = True, 24 | ): 25 | """ 26 | RecursiveCharacterTextSplitter is a text splitter from the langchain library. 27 | It splits a text into chunks of a given size, using a recursive character splitter. 28 | 29 | Args: 30 | chunk_size: size of the chunks, in tokens 31 | chunk_overlap: overlap between chunks 32 | separators: list of separators to use for splitting the text 33 | keep_separator: whether to keep the separator in the chunk or not 34 | """ # noqa: E501 35 | self._tokenizer = Tokenizer() 36 | self._chunker = RecursiveCharacterTextSplitter( 37 | chunk_size=chunk_size, 38 | chunk_overlap=chunk_overlap, 39 | length_function=self._tokenizer.token_count, 40 | separators=separators, 41 | keep_separator=keep_separator) 42 | 43 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 44 | """ 45 | Using the RecursiveCharacterTextSplitter, this method takes a document and returns a list of KBDocChunks. 46 | Args: 47 | document: document to be chunked 48 | 49 | Returns: 50 | chunks: list of KBDocChunk chunks from the document, where the text is split 51 | evenly using the RecursiveCharacterTextSplitter 52 | """ # noqa: E501 53 | # TODO: check overlap not bigger than max_chunk_size 54 | text_chunks = self._chunker.split_text(document.text) 55 | return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 56 | document_id=document.id, 57 | text=text_chunk, 58 | source=document.source, 59 | metadata=deepcopy(document.metadata)) 60 | for i, text_chunk in enumerate(text_chunks)] 61 | 62 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 63 | raise NotImplementedError() 64 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/chunker/token_chunker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .base import Chunker 4 | from ..models import KBDocChunk 5 | from canopy.tokenizer import Tokenizer 6 | from ...models.data_models import Document 7 | 8 | 9 | class 
TokenChunker(Chunker): 10 | """ 11 | Simple chunker that splits a document into chunks (groups of tokens) of a given size, using a tokenizer. 12 | A TokenChunker is a derived class of Chunker, which means that it can be referenced by a name 13 | and configured in a config file. 14 | """ # noqa: E501 15 | 16 | def __init__(self, 17 | max_chunk_size: int = 256, 18 | overlap: int = 30, ): 19 | """ 20 | Using the global tokenizer, sets the class parameters for the TokenChunker 21 | and validates overlap and max_chunk_size. 22 | 23 | Args: 24 | max_chunk_size: size of the chunks, in tokens 25 | overlap: overlap between chunks, in tokens 26 | """ # noqa: E501 27 | 28 | # TODO: should add check for overlap not bigger than max_chunk_size 29 | if overlap < 0: 30 | cls_name = self.__class__.__name__ 31 | raise ValueError( 32 | f"overlap for {cls_name} can't be negative, got: {overlap}" 33 | ) 34 | 35 | if max_chunk_size <= 0: 36 | cls_name = self.__class__.__name__ 37 | raise ValueError( 38 | f"max_chunk_size for {cls_name} must be positive, got: {max_chunk_size}" 39 | ) 40 | 41 | self._tokenizer = Tokenizer() 42 | self._chunk_size = max_chunk_size 43 | self._overlap = overlap 44 | 45 | def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 46 | """ 47 | This method takes a document and returns a list of KBDocChunks, where the text is split 48 | evenly using the tokenizer. First the text is tokenized, then the tokens are split into chunks 49 | of a given size, with overlap between chunks. 50 | The last chunk is handled such that if it is smaller than the overlap, it is removed. 51 | 52 | Args: 53 | document: document to be chunked 54 | 55 | Returns: 56 | text_chunks: list of KBDocChunk chunks from the document 57 | """ # noqa: E501 58 | tokens = self._tokenizer.tokenize(document.text) 59 | token_chunks = [tokens[i:i + self._chunk_size] 60 | for i in range(0, len(tokens), 61 | self._chunk_size - self._overlap)] 62 | 63 | if len(token_chunks) == 0: 64 | return [] 65 | 66 | # remove last chunk if it is smaller than overlap 67 | if len(token_chunks[-1]) <= self._overlap and len(token_chunks) > 1: 68 | token_chunks = token_chunks[:-1] 69 | 70 | text_chunks = [self._tokenizer.detokenize(chunk) 71 | for chunk in token_chunks] 72 | return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 73 | document_id=document.id, 74 | text=text_chunk, 75 | source=document.source, 76 | metadata=document.metadata) 77 | for i, text_chunk in enumerate(text_chunks)] 78 | 79 | async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 80 | raise NotImplementedError() 81 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/models.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from canopy.models.data_models import Document, Query, SparseVector 7 | 8 | # TODO: (1) consider moving this to pinecone-text 9 | # TODO: (2) consider renaming to "Vector" or "DenseVector" 10 | # TODO: (3) consider supporting `np.ndarray` 11 | VectorValues = List[float] 12 | 13 | 14 | class KBDocChunk(Document): 15 | document_id: str 16 | 17 | 18 | class KBDocChunkWithScore(KBDocChunk): 19 | score: float 20 | 21 | 22 | class KBEncodedDocChunk(KBDocChunk): 23 | values: VectorValues 24 | sparse_values: Optional[SparseVector] = None 25 | 26 | def to_db_record(self): 27 | metadata = 
deepcopy(self.metadata) 28 | metadata["text"] = self.text 29 | metadata["document_id"] = self.document_id 30 | metadata["source"] = self.source 31 | 32 | record = { 33 | "id": self.id, 34 | "values": self.values, 35 | "metadata": metadata, 36 | 37 | } 38 | 39 | if self.sparse_values is not None and len(self.sparse_values["values"]) > 0: 40 | record["sparse_values"] = self.sparse_values 41 | 42 | return record 43 | 44 | 45 | class KBQuery(Query): 46 | values: Optional[VectorValues] = None 47 | sparse_values: Optional[SparseVector] = None 48 | 49 | 50 | class KBQueryResult(BaseModel): 51 | query: str 52 | documents: List[KBDocChunkWithScore] 53 | 54 | 55 | class DocumentWithScore(Document): 56 | score: float 57 | 58 | 59 | class QueryResult(BaseModel): 60 | query: str 61 | documents: List[DocumentWithScore] 62 | debug_info: dict = Field(default_factory=dict, exclude=True) 63 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/constants.py: -------------------------------------------------------------------------------- 1 | from canopy.knowledge_base.knowledge_base import INDEX_NAME_PREFIX 2 | 3 | COLLECTION_NAME_PREFIX = INDEX_NAME_PREFIX 4 | DENSE_VECTOR_NAME = "dense" 5 | RESERVED_METADATA_KEYS = {"document_id", "text", "source", "chunk_id"} 6 | SPARSE_VECTOR_NAME = "sparse" 7 | UUID_NAMESPACE = "867603e3-ba69-447d-a8ef-263dff19bda7" 8 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/converter.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import Dict, List, Any, Union 3 | import uuid 4 | from canopy.knowledge_base.models import ( 5 | KBDocChunkWithScore, 6 | KBEncodedDocChunk, 7 | KBQuery, 8 | VectorValues, 9 | ) 10 | from pinecone_text.sparse import SparseVector 11 | 12 | try: 13 | from qdrant_client import models 14 | except ImportError: 15 | pass 16 | 17 | from canopy.knowledge_base.qdrant.constants import ( 18 | DENSE_VECTOR_NAME, 19 | SPARSE_VECTOR_NAME, 20 | UUID_NAMESPACE, 21 | ) 22 | 23 | 24 | class QdrantConverter: 25 | @staticmethod 26 | def convert_id(_id: str) -> str: 27 | """ 28 | Converts any string into a UUID string based on a seed. 29 | 30 | Qdrant accepts UUID strings and unsigned integers as point ID. 31 | We use a seed to convert each string into a UUID string deterministically. 32 | This allows us to overwrite the same point with the original ID. 
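        For example (illustrative only), convert_id("doc_1_0") always maps to the same UUID string, since it is computed as uuid.uuid5(uuid.UUID(UUID_NAMESPACE), "doc_1_0"); re-upserting a chunk with the same id therefore overwrites the same point.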
33 | """ 34 | return str(uuid.uuid5(uuid.UUID(UUID_NAMESPACE), _id)) 35 | 36 | @staticmethod 37 | def encoded_docs_to_points( 38 | encoded_docs: List[KBEncodedDocChunk], 39 | ) -> "List[models.PointStruct]": 40 | points = [] 41 | for doc in encoded_docs: 42 | record = doc.to_db_record() 43 | _id: str = record.pop("id") 44 | dense_vector: VectorValues = record.pop("values", None) 45 | sparse_vector: SparseVector = record.pop("sparse_values", None) 46 | 47 | vector: Dict[str, models.Vector] = {} 48 | 49 | if dense_vector: 50 | vector[DENSE_VECTOR_NAME] = dense_vector 51 | 52 | if sparse_vector: 53 | vector[SPARSE_VECTOR_NAME] = models.SparseVector( 54 | indices=sparse_vector["indices"], 55 | values=sparse_vector["values"], 56 | ) 57 | 58 | points.append( 59 | models.PointStruct( 60 | id=QdrantConverter.convert_id(_id), 61 | vector=vector, 62 | payload={**record["metadata"], "chunk_id": _id}, 63 | ) 64 | ) 65 | return points 66 | 67 | @staticmethod 68 | def scored_point_to_scored_doc( 69 | scored_point, 70 | ) -> "KBDocChunkWithScore": 71 | metadata: Dict[str, Any] = deepcopy(scored_point.payload or {}) 72 | _id = metadata.pop("chunk_id") 73 | text = metadata.pop("text", "") 74 | document_id = metadata.pop("document_id") 75 | return KBDocChunkWithScore( 76 | id=_id, 77 | text=text, 78 | document_id=document_id, 79 | score=scored_point.score, 80 | source=metadata.pop("source", ""), 81 | metadata=metadata, 82 | ) 83 | 84 | @staticmethod 85 | def kb_query_to_search_vector( 86 | query: KBQuery, 87 | ) -> "Union[models.NamedVector, models.NamedSparseVector]": 88 | # Use dense vector if available, otherwise use sparse vector 89 | query_vector: Union[models.NamedVector, models.NamedSparseVector] 90 | if query.values: 91 | query_vector = models.NamedVector(name=DENSE_VECTOR_NAME, vector=query.values) # noqa: E501 92 | elif query.sparse_values: 93 | query_vector = models.NamedSparseVector( 94 | name=SPARSE_VECTOR_NAME, 95 | vector=models.SparseVector( 96 | indices=query.sparse_values["indices"], 97 | values=query.sparse_values["values"], 98 | ), 99 | ) 100 | else: 101 | raise ValueError("Query should have either dense or sparse vector.") 102 | return query_vector 103 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/qdrant/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | from itertools import islice 4 | from typing import Any, Callable, Optional 5 | 6 | import logging 7 | 8 | try: 9 | from qdrant_client import AsyncQdrantClient, QdrantClient 10 | from qdrant_client.local.async_qdrant_local import AsyncQdrantLocal 11 | except ImportError: 12 | pass 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def sync_fallback(method: Callable) -> Callable: 17 | @functools.wraps(method) 18 | async def wrapper(self, *args, **kwargs): 19 | if self._async_client is None or isinstance( 20 | self._async_client._client, AsyncQdrantLocal 21 | ): 22 | sync_method_name = method.__name__[1:] 23 | 24 | logger.warning( 25 | f"{method.__name__}() cannot be used for QdrantLocal. 
" 26 | f"Falling back to {sync_method_name}()" 27 | ) 28 | loop = asyncio.get_event_loop() 29 | 30 | call = functools.partial(getattr(self, sync_method_name), *args, **kwargs) 31 | return await loop.run_in_executor(None, call) 32 | else: 33 | return await method(self, *args, **kwargs) 34 | 35 | return wrapper 36 | 37 | 38 | def generate_clients( 39 | location: Optional[str] = None, 40 | url: Optional[str] = None, 41 | port: Optional[int] = 6333, 42 | grpc_port: int = 6334, 43 | prefer_grpc: bool = False, 44 | https: Optional[bool] = None, 45 | api_key: Optional[str] = None, 46 | prefix: Optional[str] = None, 47 | timeout: Optional[int] = None, 48 | host: Optional[str] = None, 49 | path: Optional[str] = None, 50 | force_disable_check_same_thread: bool = False, 51 | **kwargs: Any, 52 | ): 53 | sync_client = QdrantClient( 54 | location=location, 55 | url=url, 56 | port=port, 57 | grpc_port=grpc_port, 58 | prefer_grpc=prefer_grpc, 59 | https=https, 60 | api_key=api_key, 61 | prefix=prefix, 62 | timeout=timeout, 63 | host=host, 64 | path=path, 65 | force_disable_check_same_thread=force_disable_check_same_thread, 66 | **kwargs, 67 | ) 68 | 69 | if location == ":memory:" or path is not None: 70 | # In-memory Qdrant doesn't interoperate with Sync and Async clients 71 | # We fallback to sync operations in this case using @utils.sync_fallback 72 | async_client = None 73 | else: 74 | async_client = AsyncQdrantClient( 75 | location=location, 76 | url=url, 77 | port=port, 78 | grpc_port=grpc_port, 79 | prefer_grpc=prefer_grpc, 80 | https=https, 81 | api_key=api_key, 82 | prefix=prefix, 83 | timeout=timeout, 84 | host=host, 85 | path=path, 86 | force_disable_check_same_thread=force_disable_check_same_thread, 87 | **kwargs, 88 | ) 89 | 90 | return sync_client, async_client 91 | 92 | 93 | def batched(iterable, n): 94 | """ 95 | Batch elements of an iterable into fixed-length chunks or blocks. 96 | Based on itertools.batched() from Python 3.12 97 | """ 98 | it = iter(iterable) 99 | while batch := tuple(islice(it, n)): 100 | yield batch 101 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RecordEncoder 2 | from .cohere import CohereRecordEncoder 3 | from .dense import DenseRecordEncoder 4 | from .openai import OpenAIRecordEncoder 5 | from .anyscale import AnyscaleRecordEncoder 6 | from .azure_openai import AzureOpenAIRecordEncoder 7 | from .jina import JinaRecordEncoder 8 | from .sentence_transformers import SentenceTransformerRecordEncoder 9 | from .hybrid import HybridRecordEncoder 10 | from .octoai import OctoAIRecordEncoder 11 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/anyscale.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 4 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 5 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 6 | from canopy.models.data_models import Query 7 | 8 | ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1" 9 | 10 | 11 | class AnyscaleRecordEncoder(DenseRecordEncoder): 12 | """ 13 | AnyscaleRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 
14 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 15 | For more information about see: https://github.com/pinecone-io/pinecone-text 16 | 17 | """ # noqa: E501 18 | """ 19 | Initialize the AnyscaleRecordEncoder 20 | 21 | Args: 22 | api_key: The Anyscale Endpoint API Key 23 | base_url: The Base URL for Anyscale Endpoint 24 | model_name: The name of the Anyscale embeddings model to use for encoding. See https://docs.endpoints.anyscale.com/category/supported-models 25 | batch_size: The number of documents or queries to encode at once. 26 | Defaults to 400. 27 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 28 | """ # noqa: E501 29 | def __init__(self, 30 | *, 31 | api_key: str = "", 32 | base_url: str = ANYSCALE_BASE_URL, 33 | model_name: str = "thenlper/gte-large", 34 | batch_size: int = 400, 35 | **kwargs): 36 | 37 | ae_api_key = api_key or os.environ.get("ANYSCALE_API_KEY") 38 | if not ae_api_key: 39 | raise ValueError( 40 | "Anyscale API key is required to use Anyscale. " 41 | "Please provide it as an argument " 42 | "or set the ANYSCALE_API_KEY environment variable." 43 | ) 44 | ae_base_url = base_url 45 | encoder = OpenAIEncoder(model_name, 46 | base_url=ae_base_url, api_key=ae_api_key, 47 | **kwargs) 48 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 49 | 50 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 51 | """ 52 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 53 | 54 | Args: 55 | documents: A list of KBDocChunk to encode. 56 | 57 | Returns: 58 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 59 | """ # noqa: E501 60 | return super().encode_documents(documents) 61 | 62 | async def _aencode_documents_batch(self, 63 | documents: List[KBDocChunk] 64 | ) -> List[KBEncodedDocChunk]: 65 | raise NotImplementedError 66 | 67 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/azure_openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pinecone_text.dense import AzureOpenAIEncoder 4 | 5 | from canopy.knowledge_base.record_encoder import OpenAIRecordEncoder, DenseRecordEncoder 6 | import openai 7 | 8 | 9 | class AzureOpenAIRecordEncoder(OpenAIRecordEncoder): 10 | """ 11 | AzureOpenAIRecordEncoder is a type of DenseRecordEncoder that uses the Azure OpenAI's `embeddings` deployments. 12 | The implementation uses the `AzureOpenAIEncoder` class from the `pinecone-text` library. 13 | For more information about see: https://github.com/pinecone-io/pinecone-text 14 | 15 | Azure OpenAI services require a valid API key, and an Azure endpoint. You will need 16 | To set the following environment variables: 17 | - AZURE_OPENAI_API_KEY: Your Azure OpenAI API key. 18 | - AZURE_OPENAI_ENDPOINT: Your Azure endpoint, including the resource, e.g. 
`https://example-resource.azure.openai.com/` 19 | """ # noqa: E501 20 | 21 | def __init__( 22 | self, 23 | *, 24 | model_name: str, 25 | api_version: str = "2023-12-01-preview", 26 | batch_size: int = 400, 27 | **kwargs 28 | ): 29 | """ 30 | Initialize the AzureOpenAIRecordEncoder 31 | 32 | Args: 33 | model_name: The name of the embeddings model deployment to use for encoding 34 | api_version: The Azure OpenAI API version to use. Defaults to "2023-12-01-preview". 35 | batch_size: The number of documents or queries to encode at once. 36 | Defaults to 400. 37 | **kwargs: Additional arguments to pass to the underlying `pinecone-text.AzureOpenAIEncoder`. 38 | """ # noqa: E501 39 | try: 40 | encoder = AzureOpenAIEncoder(model_name, api_version=api_version, **kwargs) 41 | except (openai.OpenAIError, ValueError) as e: 42 | raise RuntimeError( 43 | "Failed to connect to Azure OpenAI, please make sure that the " 44 | "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT environment variables " 45 | "are set correctly. " 46 | f"Underlying Error:\n{self._format_openai_error(e)}" 47 | ) from e 48 | 49 | DenseRecordEncoder.__init__(self, dense_encoder=encoder, batch_size=batch_size, 50 | **kwargs) 51 | 52 | def _format_error(self, err): 53 | if isinstance(err, openai.AuthenticationError): 54 | return ( 55 | "Failed to connect to Azure OpenAI, please make sure that the " 56 | "AZURE_OPENAI_API_KEY environment variable is set correctly. " 57 | f"Underlying Error:\n{self._format_openai_error(err)}" 58 | ) 59 | elif isinstance(err, openai.APIConnectionError): 60 | return ( 61 | f"Failed to connect to your Azure OpenAI endpoint, please make sure " 62 | f"that the provided endpoint {os.getenv('AZURE_OPENAI_ENDPOINT')} " 63 | f"is correct. Underlying Error:\n{self._format_openai_error(err)}" 64 | ) 65 | elif isinstance(err, openai.NotFoundError): 66 | return ( 67 | f"Failed to connect to your Azure OpenAI. Please make sure that " 68 | f"you have provided the correct deployment name: {self.model_name} " 69 | f"and API version: {self._client._api_version}. " 70 | f"Underlying Error:\n{self._format_openai_error(err)}" 71 | ) 72 | else: 73 | return super()._format_error(err) 74 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/cohere.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from pinecone_text.dense.cohere_encoder import CohereEncoder 3 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 4 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | class CohereRecordEncoder(DenseRecordEncoder): 9 | """ 10 | CohereRecordEncoder is a type of DenseRecordEncoder that uses the Cohere `embed` API. 11 | The implementation uses the `CohereEncoder` class from the `pinecone-text` library. 12 | For more information about see: https://github.com/pinecone-io/pinecone-text 13 | 14 | """ # noqa: E501 15 | 16 | def __init__( 17 | self, 18 | *, 19 | model_name: str = "embed-english-v3.0", 20 | batch_size: int = 100, 21 | **kwargs, 22 | ): 23 | """ 24 | Initialize the CohereRecordEncoder 25 | 26 | Args: 27 | model_name: The name of the Cohere embeddings model to use for encoding. See https://docs.cohere.com/reference/embed 28 | batch_size: The number of documents or queries to encode at once. 29 | Defaults to 100. 
30 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. CohereEncoder`. 31 | """ # noqa: E501 32 | encoder = CohereEncoder(model_name, **kwargs) 33 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 34 | 35 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 36 | """ 37 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 38 | 39 | Args: 40 | documents: A list of KBDocChunk to encode. 41 | 42 | Returns: 43 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 44 | """ # noqa: E501 45 | return super().encode_documents(documents) 46 | 47 | async def _aencode_documents_batch( 48 | self, documents: List[KBDocChunk] 49 | ) -> List[KBEncodedDocChunk]: 50 | raise NotImplementedError 51 | 52 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 53 | raise NotImplementedError 54 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/dense.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from typing import List 3 | from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder 4 | 5 | from .base import RecordEncoder 6 | from canopy.knowledge_base.models import KBQuery, KBEncodedDocChunk, KBDocChunk 7 | from canopy.models.data_models import Query 8 | 9 | 10 | class DenseRecordEncoder(RecordEncoder): 11 | """ 12 | DenseRecordEncoder is a subclass of RecordEncoder that generates dense vector representation of documents chunks and textual queries. 13 | The dense representation generated by the `DenseRecordEncoder` is a list of floats in a given dimension. 14 | DenseRecordEncoder wraps a BaseDenseEncoder from the `pinecone-text` library to encode the text itself. 15 | for more information about the BaseDenseEncoder see: https://github.com/pinecone-io/pinecone-text 16 | """ # noqa: E501 17 | 18 | def __init__(self, 19 | dense_encoder: BaseDenseEncoder, 20 | **kwargs): 21 | """ 22 | Initialize the encoder. 23 | 24 | Args: 25 | dense_encoder: A BaseDenseEncoder to encode the text. 26 | **kwargs: Additional arguments to pass to the RecordEncoder. 27 | """ # noqa: E501 28 | super().__init__(**kwargs) 29 | self._dense_encoder = dense_encoder 30 | 31 | def _encode_documents_batch(self, 32 | documents: List[KBDocChunk] 33 | ) -> List[KBEncodedDocChunk]: 34 | """ 35 | Encode a batch of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 36 | 37 | Args: 38 | documents: A list of KBDocChunk to encode. 39 | Returns: 40 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 41 | """ # noqa: E501 42 | dense_values = self._dense_encoder.encode_documents([d.text for d in documents]) 43 | return [KBEncodedDocChunk(**d.model_dump(), values=v) for d, v in 44 | zip(documents, dense_values)] 45 | 46 | def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 47 | """ 48 | Encode a batch of queries, takes a list of Query and returns a list of KBQuery. 49 | Args: 50 | queries: A list of Query to encode. 51 | Returns: 52 | encoded queries: A list of KBQuery, with the `values` field populated by the generated embeddings vector. 
53 | """ # noqa: E501 54 | dense_values = self._dense_encoder.encode_queries([q.text for q in queries]) 55 | return [ 56 | KBQuery(**q.model_dump(), values=v) for q, v in zip(queries, dense_values) 57 | ] 58 | 59 | @cached_property 60 | def dimension(self) -> int: 61 | """ 62 | The dimension is the length of the vector generated by the `DenseRecordEncoder` 63 | 64 | Returns: 65 | dimension(int): the dimension of the encoder 66 | """ # noqa: E501 67 | return self._dense_encoder.dimension 68 | 69 | async def _aencode_documents_batch(self, 70 | documents: List[KBDocChunk] 71 | ) -> List[KBEncodedDocChunk]: 72 | raise NotImplementedError 73 | 74 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/jina.py: -------------------------------------------------------------------------------- 1 | from pinecone_text.dense import JinaEncoder 2 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 3 | 4 | 5 | class JinaRecordEncoder(DenseRecordEncoder): 6 | """ 7 | JinaRecordEncoder is a type of DenseRecordEncoder that uses the JinaAI `embeddings` API. 8 | The implementation uses the `JinaEncoder` class from the `pinecone-text` library. 9 | For more information about see: https://github.com/pinecone-io/pinecone-text 10 | 11 | """ # noqa: E501 12 | 13 | def __init__(self, 14 | *, 15 | model_name: str = "jina-embeddings-v2-base-en", 16 | batch_size: int = 400, 17 | **kwargs): 18 | """ 19 | Initialize the JinaRecordEncoder 20 | 21 | Args: 22 | model_name: The name of the embedding model to use. 23 | batch_size: The number of documents or queries to encode at once. 24 | Defaults to 400. 25 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. JinaEncoder`. 26 | """ # noqa: E501 27 | encoder = JinaEncoder(model_name, **kwargs) 28 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 29 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/octoai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 4 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 5 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 6 | from canopy.models.data_models import Query 7 | 8 | OCTOAI_BASE_URL = "https://text.octoai.run/v1" 9 | 10 | 11 | class OctoAIRecordEncoder(DenseRecordEncoder): 12 | """ 13 | OctoAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 14 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 15 | For more information about see: https://github.com/pinecone-io/pinecone-text 16 | 17 | """ # noqa: E501 18 | """ 19 | Initialize the OctoAIRecordEncoder 20 | 21 | Args: 22 | api_key: The OctoAI Endpoint API Key 23 | base_url: The Base URL for the OctoAI Endpoint 24 | model_name: The name of the OctoAI embeddings model to use for encoding. See https://octo.ai/docs/text-gen-solution/getting-started 25 | batch_size: The number of documents or queries to encode at once. 26 | Defaults to 1. 27 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 
28 | """ # noqa: E501 29 | def __init__(self, 30 | *, 31 | api_key: str = "", 32 | base_url: str = OCTOAI_BASE_URL, 33 | model_name: str = "thenlper/gte-large", 34 | batch_size: int = 1024, 35 | **kwargs): 36 | 37 | octoai_api_key = api_key or os.environ.get("OCTOAI_API_KEY") 38 | if not octoai_api_key: 39 | raise ValueError( 40 | "An OctoAI API token is required to use OctoAI. " 41 | "Please provide it as an argument " 42 | "or set the OCTOAI_API_KEY environment variable." 43 | ) 44 | octoai_base_url = base_url 45 | encoder = OpenAIEncoder(model_name, 46 | base_url=octoai_base_url, api_key=octoai_api_key, 47 | **kwargs) 48 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 49 | 50 | def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]: 51 | """ 52 | Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk. 53 | 54 | Args: 55 | documents: A list of KBDocChunk to encode. 56 | 57 | Returns: 58 | encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector. 59 | """ # noqa: E501 60 | return super().encode_documents(documents) 61 | 62 | async def _aencode_documents_batch(self, 63 | documents: List[KBDocChunk] 64 | ) -> List[KBEncodedDocChunk]: 65 | raise NotImplementedError 66 | 67 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError 4 | from pinecone_text.dense.openai_encoder import OpenAIEncoder 5 | from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery 6 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 7 | from canopy.models.data_models import Query 8 | 9 | 10 | class OpenAIRecordEncoder(DenseRecordEncoder): 11 | """ 12 | OpenAIRecordEncoder is a type of DenseRecordEncoder that uses the OpenAI `embeddings` API. 13 | The implementation uses the `OpenAIEncoder` class from the `pinecone-text` library. 14 | For more information about see: https://github.com/pinecone-io/pinecone-text 15 | 16 | """ # noqa: E501 17 | 18 | def __init__( 19 | self, 20 | *, 21 | model_name: str = "text-embedding-3-small", 22 | batch_size: int = 400, 23 | dimension: Optional[int] = None, 24 | **kwargs 25 | ): 26 | """ 27 | Initialize the OpenAIRecordEncoder 28 | 29 | Args: 30 | model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings 31 | batch_size: The number of documents or queries to encode at once. 32 | Defaults to 400. 33 | dimension: The dimension of the embeddings vector to generate. 34 | **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`. 
35 | """ # noqa: E501 36 | try: 37 | encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs) 38 | except OpenAIError as e: 39 | raise RuntimeError( 40 | "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY " 41 | "environment variable is set correctly.\n" 42 | f"Error: {self._format_openai_error(e)}" 43 | ) from e 44 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 45 | 46 | async def _aencode_documents_batch(self, 47 | documents: List[KBDocChunk] 48 | ) -> List[KBEncodedDocChunk]: 49 | raise NotImplementedError 50 | 51 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 52 | raise NotImplementedError 53 | 54 | @staticmethod 55 | def _format_openai_error(e): 56 | try: 57 | response = e.response.json() 58 | if "error" in response: 59 | return response["error"]["message"] 60 | elif "message" in response: 61 | return response["message"] 62 | else: 63 | return str(e) 64 | except Exception: 65 | return str(e) 66 | 67 | def _format_error(self, err): 68 | if isinstance(err, RateLimitError): 69 | return (f"Your OpenAI account seem to have reached the rate limit. " 70 | f"Details: {self._format_openai_error(err)}") 71 | elif isinstance(err, (AuthenticationError, APIConnectionError)): 72 | return (f"Failed to connect to OpenAI, please make sure that the " 73 | f"OPENAI_API_KEY environment variable is set correctly. " 74 | f"Details: {self._format_openai_error(err)}") 75 | else: 76 | return self._format_openai_error(err) 77 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/record_encoder/sentence_transformers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from pinecone_text.dense import SentenceTransformerEncoder 3 | from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder 4 | from huggingface_hub.utils import RepositoryNotFoundError 5 | 6 | 7 | class SentenceTransformerRecordEncoder(DenseRecordEncoder): 8 | """ 9 | SentenceTransformerRecordEncoder is a type of DenseRecordEncoder that uses a Sentence Transformer model. 10 | The implementation uses the `SentenceTransformerEncoder` class from the `pinecone-text` library. 11 | For more information about see: https://github.com/pinecone-io/pinecone-text 12 | 13 | """ # noqa: E501 14 | 15 | def __init__(self, 16 | *, 17 | model_name: str = "sentence-transformers/all-MiniLM-L6-v2", 18 | query_encoder_name: Optional[str] = None, 19 | batch_size: int = 400, 20 | device: Optional[str] = None, 21 | **kwargs) -> None: 22 | """ 23 | Initialize the SentenceTransformerRecordEncoder 24 | 25 | Args: 26 | model_name: The name of the embedding model to use for encoding documents. 27 | See https://huggingface.co/models?library=sentence-transformers 28 | for all possible Sentence Transformer models. 29 | query_encoder_name: The name of the embedding model to use for encoding queries. 30 | See https://huggingface.co/models?library=sentence-transformers 31 | for all possible Sentence Transformer models. 32 | Defaults to `model_name`. 33 | batch_size: The number of documents or queries to encode at once. 34 | Defaults to 400. 35 | device: The local device to use for encoding, for example "cpu", "cuda" or "mps". 36 | Defaults to "cuda" if cuda is available, otherwise to "cpu". 37 | **kwargs: Additional arguments to pass to the underlying `pinecone-text.SentenceTransformerEncoder`. 
38 | """ # noqa: E501 39 | try: 40 | encoder = SentenceTransformerEncoder( 41 | document_encoder_name=model_name, 42 | query_encoder_name=query_encoder_name, 43 | device=device, 44 | **kwargs, 45 | ) 46 | except RepositoryNotFoundError as e: 47 | raise RuntimeError( 48 | "Your chosen Sentence Transformer model(s) could not be found. " 49 | f"Details: {str(e)}" 50 | ) from e 51 | except ImportError: 52 | raise ImportError( 53 | f"{self.__class__.__name__} requires the `torch` and `transformers` " 54 | f"extra dependencies. Please install them using " 55 | f"`pip install canopy-sdk[torch,transformers]`." 56 | ) 57 | super().__init__(dense_encoder=encoder, batch_size=batch_size) 58 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | from .reranker import Reranker 2 | from .transparent import TransparentReranker 3 | from .cohere import CohereReranker 4 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/cohere.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional 3 | 4 | 5 | from canopy.knowledge_base.models import KBQueryResult 6 | from canopy.knowledge_base.reranker import Reranker 7 | 8 | try: 9 | import cohere 10 | from cohere import CohereAPIError 11 | except (OSError, ImportError, ModuleNotFoundError): 12 | _cohere_installed = False 13 | else: 14 | _cohere_installed = True 15 | 16 | 17 | class CohereReranker(Reranker): 18 | """ 19 | Reranker that uses Cohere's text embedding to rerank documents. 20 | 21 | For each query and documents returned for that query, returns a list 22 | of documents ordered by their relevance to the provided query. 23 | """ 24 | 25 | def __init__(self, 26 | model_name: str = 'rerank-english-v2.0', 27 | *, 28 | top_n: int = 10, 29 | api_key: Optional[str] = None): 30 | """ 31 | Initializes the Cohere reranker. 32 | 33 | Args: 34 | model_name: The identifier of the model to use, one of : 35 | rerank-english-v2.0, rerank-multilingual-v2.0 36 | top_n: The number of most relevant documents return, defaults to 10 37 | api_key: API key for Cohere. If not passed `CO_API_KEY` environment 38 | variable will be used. 39 | """ 40 | 41 | if not _cohere_installed: 42 | raise ImportError( 43 | "Failed to import cohere. Make sure you install cohere extra " 44 | "dependencies by running: " 45 | "pip install canopy-sdk[cohere]" 46 | ) 47 | cohere_api_key = api_key or os.environ.get("CO_API_KEY") 48 | if cohere_api_key is None: 49 | raise RuntimeError( 50 | "Cohere API key is required to use Cohere Reranker. " 51 | "Please provide it as an argument " 52 | "or set the CO_API_KEY environment variable." 53 | ) 54 | self._client = cohere.Client(api_key=cohere_api_key) 55 | self._model_name = model_name 56 | self._top_n = top_n 57 | 58 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 59 | reranked_query_results: List[KBQueryResult] = [] 60 | for result in results: 61 | texts = [doc.text for doc in result.documents] 62 | try: 63 | response = self._client.rerank(query=result.query, 64 | documents=texts, 65 | top_n=self._top_n, 66 | model=self._model_name) 67 | except CohereAPIError as e: 68 | raise RuntimeError("Failed to rerank documents using Cohere." 
69 | f" Underlying Error:\n{e.message}") 70 | 71 | reranked_docs = [] 72 | for rerank_result in response: 73 | doc = result.documents[rerank_result.index].model_copy( 74 | deep=True, 75 | update=dict(score=rerank_result.relevance_score) 76 | ) 77 | reranked_docs.append(doc) 78 | 79 | reranked_query_results.append(KBQueryResult(query=result.query, 80 | documents=reranked_docs)) 81 | return reranked_query_results 82 | 83 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 84 | raise NotImplementedError() 85 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/reranker.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.knowledge_base.models import KBQueryResult 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class Reranker(ABC, ConfigurableMixin): 9 | """ 10 | Abstract class for rerankers. Rerankers take a list of KBQueryResult and return a list of KBQueryResult, 11 | where the results are reranked according to the reranker logic. 12 | Reranker is an abstract class that must be subclassed to be used, 13 | """ # noqa: E501 14 | 15 | @abstractmethod 16 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 17 | pass 18 | 19 | @abstractmethod 20 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 21 | pass 22 | -------------------------------------------------------------------------------- /src/canopy/knowledge_base/reranker/transparent.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.knowledge_base.models import KBQueryResult 4 | from canopy.knowledge_base.reranker import Reranker 5 | 6 | 7 | class TransparentReranker(Reranker): 8 | """ 9 | Transparent reranker that does nothing, it just returns the results as is. This is the default reranker. 10 | The TransparentReranker is used as a placeholder for future development "forcing" every result set to be reranked. 11 | """ # noqa: E501 12 | 13 | def rerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 14 | """ 15 | Returns the results as is. 16 | 17 | Args: 18 | results: A list of KBQueryResult to rerank. 19 | 20 | Returns: 21 | results: A list of KBQueryResult, same as the input. 
22 | """ # noqa: E501 23 | return results 24 | 25 | async def arerank(self, results: List[KBQueryResult]) -> List[KBQueryResult]: 26 | return results 27 | -------------------------------------------------------------------------------- /src/canopy/llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLLM 2 | from .openai import OpenAILLM 3 | from .anyscale import AnyscaleLLM 4 | from .azure_openai_llm import AzureOpenAILLM 5 | from .cohere import CohereLLM 6 | from .octoai import OctoAILLM 7 | -------------------------------------------------------------------------------- /src/canopy/llm/anyscale.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | import os 3 | from canopy.llm import OpenAILLM 4 | from canopy.llm.models import Function 5 | from canopy.models.data_models import Messages 6 | 7 | ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1" 8 | FUNCTION_MODEL_LIST = [ 9 | "mistralai/Mistral-7B-Instruct-v0.1", 10 | "mistralai/Mixtral-8x7B-Instruct-v0.1", 11 | ] 12 | 13 | 14 | class AnyscaleLLM(OpenAILLM): 15 | """ 16 | Anyscale LLM wrapper built on top of the OpenAI Python client. 17 | 18 | Note: Anyscale requires a valid API key to use this class. 19 | You can set the "ANYSCALE_API_KEY" environment variable. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | model_name: str = "meta-llama/Llama-2-7b-chat-hf", 25 | *, 26 | base_url: Optional[str] = ANYSCALE_BASE_URL, 27 | api_key: Optional[str] = None, 28 | **kwargs: Any, 29 | ): 30 | ae_api_key = api_key or os.environ.get("ANYSCALE_API_KEY") 31 | if not ae_api_key: 32 | raise ValueError( 33 | "Anyscale API key is required to use Anyscale. " 34 | "Please provide it as an argument " 35 | "or set the ANYSCALE_API_KEY environment variable." 36 | ) 37 | ae_base_url = base_url 38 | super().__init__(model_name, api_key=ae_api_key, base_url=ae_base_url, **kwargs) 39 | 40 | def enforced_function_call( 41 | self, 42 | system_prompt: str, 43 | chat_history: Messages, 44 | function: Function, 45 | *, 46 | max_tokens: Optional[int] = None, 47 | model_params: Optional[dict] = None, 48 | ) -> dict: 49 | model = self.model_name 50 | if model_params and "model" in model_params: 51 | model = model_params["model"] 52 | if model not in FUNCTION_MODEL_LIST: 53 | raise NotImplementedError( 54 | f"Model {model} doesn't support function calling. 
" 55 | "To use function calling capability, please select a different model.\n" 56 | "Pleaes check following link for details: " 57 | "https://docs.endpoints.anyscale.com/guides/function-calling" 58 | ) 59 | else: 60 | return super().enforced_function_call( 61 | system_prompt, chat_history, function, 62 | max_tokens=max_tokens, model_params=model_params 63 | ) 64 | 65 | def aenforced_function_call(self, 66 | system_prompt: str, 67 | chat_history: Messages, 68 | function: Function, 69 | *, 70 | max_tokens: Optional[int] = None, 71 | model_params: Optional[dict] = None 72 | ): 73 | raise NotImplementedError() 74 | -------------------------------------------------------------------------------- /src/canopy/llm/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Iterable, Optional 3 | 4 | from canopy.llm.models import Function 5 | from canopy.models.api_models import ChatResponse, StreamingChatChunk 6 | from canopy.models.data_models import Messages, Context 7 | from canopy.utils.config import ConfigurableMixin 8 | 9 | 10 | class BaseLLM(ABC, ConfigurableMixin): 11 | def __init__(self, 12 | model_name: str): 13 | self.model_name = model_name 14 | 15 | @abstractmethod 16 | def chat_completion(self, 17 | system_prompt: str, 18 | chat_history: Messages, 19 | context: Optional[Context] = None, 20 | *, 21 | stream: bool = False, 22 | max_tokens: Optional[int] = None, 23 | model_params: Optional[dict] = None, 24 | ) -> Union[ChatResponse, Iterable[StreamingChatChunk]]: 25 | pass 26 | 27 | @abstractmethod 28 | def enforced_function_call(self, 29 | system_prompt: str, 30 | chat_history: Messages, 31 | function: Function, 32 | *, 33 | max_tokens: Optional[int] = None, 34 | model_params: Optional[dict] = None, 35 | ) -> dict: 36 | pass 37 | 38 | @abstractmethod 39 | async def achat_completion(self, 40 | system_prompt: str, 41 | chat_history: Messages, 42 | context: Optional[Context] = None, 43 | *, 44 | stream: bool = False, 45 | max_generated_tokens: Optional[int] = None, 46 | model_params: Optional[dict] = None, 47 | ) -> Union[ChatResponse, 48 | Iterable[StreamingChatChunk]]: 49 | pass 50 | 51 | @abstractmethod 52 | async def aenforced_function_call(self, 53 | system_prompt: str, 54 | chat_history: Messages, 55 | function: Function, 56 | *, 57 | max_tokens: Optional[int] = None, 58 | model_params: Optional[dict] = None 59 | ): 60 | pass 61 | -------------------------------------------------------------------------------- /src/canopy/llm/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | 3 | from pydantic import BaseModel, model_serializer 4 | 5 | 6 | class FunctionPrimitiveProperty(BaseModel): 7 | name: str 8 | type: str 9 | description: Optional[str] = None 10 | enum: Optional[List[str]] = None 11 | 12 | 13 | class FunctionArrayProperty(BaseModel): 14 | name: str 15 | items_type: str 16 | # we require description for array properties 17 | # because the model is more struggling with them 18 | description: str 19 | 20 | def model_dump(self, *args, **kwargs): 21 | super_dict = super().model_dump(*args, **kwargs) 22 | if "items_type" in super_dict: 23 | super_dict["type"] = "array" 24 | super_dict["items"] = {"type": super_dict.pop("items_type")} 25 | return super_dict 26 | 27 | 28 | FunctionProperty = Union[FunctionPrimitiveProperty, FunctionArrayProperty] 29 | 30 | 31 | class FunctionParameters(BaseModel): 
32 | required_properties: List[FunctionProperty] 33 | optional_properties: List[FunctionProperty] = [] 34 | 35 | @model_serializer() 36 | def serialize_model(self): 37 | return { 38 | "type": "object", 39 | "properties": { 40 | pro.name: pro.model_dump(exclude_none=True, exclude={"name"}) 41 | for pro in self.required_properties + self.optional_properties 42 | }, 43 | "required": [pro.name for pro in self.required_properties], 44 | } 45 | 46 | 47 | class Function(BaseModel): 48 | name: str 49 | description: str 50 | parameters: FunctionParameters 51 | -------------------------------------------------------------------------------- /src/canopy/llm/octoai.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Any 2 | import os 3 | from canopy.llm import OpenAILLM 4 | from canopy.llm.models import Function 5 | from canopy.models.data_models import Messages 6 | 7 | OCTOAI_BASE_URL = "https://text.octoai.run/v1" 8 | 9 | 10 | class OctoAILLM(OpenAILLM): 11 | """ 12 | OctoAI LLM wrapper built on top of the OpenAI Python client. 13 | 14 | Note: OctoAI requires a valid API key to use this class. 15 | You can set the "OCTOAI_API_KEY" environment variable. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | model_name: str = "mistral-7b-instruct-fp16", 21 | *, 22 | base_url: Optional[str] = OCTOAI_BASE_URL, 23 | api_key: Optional[str] = None, 24 | **kwargs: Any, 25 | ): 26 | octoai_api_key = api_key or os.environ.get("OCTOAI_API_KEY") 27 | if not octoai_api_key: 28 | raise ValueError( 29 | "OctoAI API key is required to use OctoAI. " 30 | "If you haven't done it, please sign up at https://octo.ai \n" 31 | "The key can be provided as an argument or " 32 | "via the OCTOAI_API_KEY environment variable." 33 | ) 34 | octoai_base_url = base_url 35 | super().__init__( 36 | model_name, 37 | api_key=octoai_api_key, 38 | base_url=octoai_base_url, 39 | **kwargs 40 | ) 41 | 42 | def enforced_function_call( 43 | self, 44 | system_prompt: str, 45 | chat_history: Messages, 46 | function: Function, 47 | *, 48 | max_tokens: Optional[int] = None, 49 | model_params: Optional[dict] = None, 50 | ) -> dict: 51 | raise NotImplementedError("OctoAI doesn't support function calling.") 52 | 53 | def aenforced_function_call(self, 54 | system_prompt: str, 55 | chat_history: Messages, 56 | function: Function, 57 | *, 58 | max_tokens: Optional[int] = None, 59 | model_params: Optional[dict] = None 60 | ): 61 | raise NotImplementedError("OctoAI doesn't support function calling.") 62 | -------------------------------------------------------------------------------- /src/canopy/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy/models/__init__.py -------------------------------------------------------------------------------- /src/canopy/models/api_models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Iterable 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | from canopy.models.data_models import MessageBase 6 | 7 | 8 | class _Choice(BaseModel): 9 | index: int 10 | message: MessageBase 11 | finish_reason: Optional[str] = None 12 | 13 | 14 | class _StreamChoice(BaseModel): 15 | index: int 16 | delta: dict 17 | finish_reason: Optional[str] = None 18 | 19 | 20 | class TokenCounts(BaseModel): 21 | prompt_tokens: int 22 | completion_tokens: int 
23 | total_tokens: int 24 | 25 | 26 | class ChatResponse(BaseModel): 27 | id: str = Field(description="Canopy session Id.") 28 | object: str 29 | created: int 30 | model: str 31 | choices: Sequence[_Choice] 32 | usage: TokenCounts 33 | debug_info: dict = Field(default_factory=dict, exclude=True) 34 | 35 | 36 | class StreamingChatChunk(BaseModel): 37 | id: str 38 | object: str 39 | created: int 40 | model: str 41 | choices: Sequence[_StreamChoice] 42 | 43 | 44 | class StreamingChatResponse(BaseModel): 45 | chunks: Iterable[StreamingChatChunk] 46 | debug_info: dict = Field(default_factory=dict, exclude=True) 47 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .cohere import CohereAPITokenizer, CohereHFTokenizer 2 | from .llama import LlamaTokenizer 3 | from .openai import OpenAITokenizer 4 | from .tokenizer import Tokenizer 5 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from canopy.models.data_models import Messages 5 | from canopy.utils.config import ConfigurableMixin 6 | 7 | 8 | class BaseTokenizer(ABC, ConfigurableMixin): 9 | 10 | @abstractmethod 11 | def tokenize(self, text: str) -> List[str]: 12 | pass 13 | 14 | @abstractmethod 15 | def detokenize(self, tokens: List[str]) -> str: 16 | pass 17 | 18 | def token_count(self, text: str) -> int: 19 | return len(self.tokenize(text)) 20 | 21 | @abstractmethod 22 | def messages_token_count(self, messages: Messages) -> int: 23 | pass 24 | -------------------------------------------------------------------------------- /src/canopy/tokenizer/openai.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | from typing import List 3 | from .base import BaseTokenizer 4 | from ..models.data_models import Messages 5 | 6 | 7 | class OpenAITokenizer(BaseTokenizer): 8 | """ 9 | Tokenizer for OpenAI models, based on the tiktoken library. 10 | 11 | Usage: 12 | Initialize the singleton tokenizer with the OpenAITokenizer class: 13 | >>> from canopy.tokenizer import Tokenizer 14 | >>> Tokenizer.initialize(tokenizer_class=OpenAITokenizer, model_name="gpt-3.5-turbo") 15 | 16 | You can then use the tokenizer instance from anywhere in the code: 17 | >>> from canopy.tokenizer import Tokenizer 18 | >>> tokenizer = Tokenizer() 19 | >>> tokenizer.tokenize("Hello world!") 20 | ['Hello', ' world', '!'] 21 | """ # noqa: E501 22 | 23 | MESSAGE_TOKENS_OVERHEAD = 3 24 | FIXED_PREFIX_TOKENS = 3 25 | 26 | def __init__(self, model_name: str = "gpt-3.5-turbo"): 27 | """ 28 | Initialize the tokenizer. 29 | 30 | Args: 31 | model_name: The name of the model to use. Defaults to "gpt-3.5-turbo". 32 | You can find the list of available models here: https://github.com/openai/tiktoken/blob/39f29cecdb6fc38d9a3434e5dd15e4de58cf3c80/tiktoken/model.py#L19C1-L19C18 33 | As you can see, both gpt-3.5 and gpt-4 are using the same cl100k_base tokenizer. 34 | """ # noqa: E501 35 | self._encoder = tiktoken.encoding_for_model(model_name) 36 | 37 | def tokenize(self, text: str) -> List[str]: 38 | """ 39 | Tokenize a text using tiktoken. 40 | 41 | Args: 42 | text: The text to tokenize. 43 | 44 | Returns: 45 | The list of tokens. 
46 | """ 47 | return [self._encoder.decode([encoded_token]) 48 | for encoded_token in self._encode(text)] 49 | 50 | def detokenize(self, tokens: List[str]) -> str: 51 | """ 52 | Detokenize a list of tokens that were previously tokenized using this tokenizer. 53 | 54 | Args: 55 | tokens: The list of tokens to detokenize. 56 | 57 | Returns: 58 | The detokenized text as a string. 59 | """ 60 | if not isinstance(tokens, List): 61 | raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") 62 | return "".join(tokens) 63 | 64 | def token_count(self, text: str) -> int: 65 | """ 66 | Count the number of tokens in a text. 67 | 68 | Args: 69 | text: The text to count the tokens of. 70 | 71 | Returns: 72 | The number of tokens in the text. 73 | """ 74 | return len(self._encode(text)) 75 | 76 | def _encode(self, text): 77 | return self._encoder.encode(text, disallowed_special=()) 78 | 79 | def messages_token_count(self, messages: Messages) -> int: 80 | """ 81 | Count the number of tokens in a list of messages as expected to be counted by OpenAI models. 82 | Account for the overhead of the messages structure. 83 | Taken from: https://github.com/openai/openai-cookbook/.../How_to_format_inputs_to_ChatGPT_models.ipynb 84 | 85 | Args: 86 | messages: The list of messages to count the tokens of. 87 | 88 | Returns: 89 | The number of tokens in the messages, as expected to be counted by OpenAI models. 90 | """ # noqa: E501 91 | num_tokens = 0 92 | for message in messages: 93 | num_tokens += self.MESSAGE_TOKENS_OVERHEAD 94 | for key, value in message.model_dump().items(): 95 | num_tokens += self.token_count(value) 96 | num_tokens += self.FIXED_PREFIX_TOKENS 97 | return num_tokens 98 | -------------------------------------------------------------------------------- /src/canopy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy/utils/__init__.py -------------------------------------------------------------------------------- /src/canopy/utils/debugging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | CANOPY_DEBUG_INFO = os.getenv("CANOPY_DEBUG_INFO", "FALSE").lower() == "true" 4 | -------------------------------------------------------------------------------- /src/canopy/utils/directory.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | class Directory: 5 | """Stores the directory paths for Canopy library""" 6 | 7 | ROOT = Path(__file__).parent.parent 8 | CONFIG_TEMPLATES = ROOT.joinpath("config_templates") 9 | -------------------------------------------------------------------------------- /src/canopy_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy_cli/__init__.py -------------------------------------------------------------------------------- /src/canopy_cli/cli_spinner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | import itertools 4 | 5 | 6 | class Spinner(object): 7 | 8 | def __init__(self, disable=False, force=False, stream=sys.stdout, cycle=None): 9 | _cycle = cycle or ['-', '/', '|', '\\'] 10 | self.spinner_cycle = itertools.cycle(_cycle) 11 | self.disable = disable 12 | self.force = 
force 13 | self.stream = stream 14 | self.stop_running = None 15 | self.spin_thread = None 16 | 17 | def start(self): 18 | if self.disable: 19 | return 20 | if self.stream.isatty() or self.force: 21 | self.stop_running = threading.Event() 22 | self.spin_thread = threading.Thread(target=self.init_spin) 23 | self.spin_thread.start() 24 | 25 | def stop(self): 26 | if self.spin_thread: 27 | self.stop_running.set() 28 | self.spin_thread.join() 29 | 30 | def init_spin(self): 31 | while not self.stop_running.is_set(): 32 | content_to_stream = next(self.spinner_cycle) 33 | self.stream.write(content_to_stream) 34 | self.stream.flush() 35 | self.stop_running.wait(0.25) 36 | self.stream.write(''.join(['\b'] * len(content_to_stream))) 37 | self.stream.flush() 38 | 39 | def __enter__(self): 40 | self.start() 41 | return self 42 | 43 | def __exit__(self, exc_type, exc_val, exc_tb): 44 | if self.disable: 45 | return False 46 | self.stop() 47 | return False 48 | -------------------------------------------------------------------------------- /src/canopy_cli/data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_loader import ( 2 | load_from_path, 3 | IDsNotUniqueError, 4 | DocumentsValidationError 5 | ) 6 | -------------------------------------------------------------------------------- /src/canopy_cli/data_loader/errors.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from click._compat import get_text_stderr 3 | from click import echo 4 | 5 | 6 | class IDsNotUniqueError(ValueError): 7 | pass 8 | 9 | 10 | class DocumentsValidationError(ValueError): 11 | pass 12 | 13 | 14 | class DataLoaderException(Exception): 15 | """An exception that Click can handle and show to the user.""" 16 | 17 | #: The exit code for this exception. 
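#: A non-zero exit code is assumed here to follow the click.ClickException convention,
#: so the CLI process exits with a failure status when this error is shown.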
18 | exit_code = 1 19 | 20 | def __init__(self, file_name: str, row_id: str, err: str) -> None: 21 | message = f""" 22 | {file_name}, line {row_id} - {err} 23 | """ 24 | super().__init__(message) 25 | self.file_name = file_name 26 | self.row_id = row_id 27 | self.err = err 28 | 29 | def format_message(self) -> str: 30 | message = f""" 31 | {self.file_name}, line {self.row_id} - {self.err} 32 | """ 33 | return message 34 | 35 | def __str__(self) -> str: 36 | return self.format_message() 37 | 38 | def show(self, file: t.Optional[t.IO] = None) -> None: 39 | if file is None: 40 | file = get_text_stderr() 41 | 42 | echo("{message}".format(message=self.format_message()), file=file) 43 | -------------------------------------------------------------------------------- /src/canopy_cli/errors.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click import ClickException 3 | 4 | from canopy_cli.data_loader.data_loader import format_multiline 5 | 6 | 7 | class CLIError(ClickException): 8 | def format_message(self) -> str: 9 | return click.style(format_multiline(self.message), fg='red') 10 | 11 | 12 | class ConfigError(RuntimeError): 13 | pass 14 | -------------------------------------------------------------------------------- /src/canopy_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/src/canopy_server/__init__.py -------------------------------------------------------------------------------- /src/canopy_server/_redocs_template.py: -------------------------------------------------------------------------------- 1 | HTML_TEMPLATE = """<!DOCTYPE html> 2 | <html> 3 | <head> 4 | <meta http-equiv="content-type" content="text/html; charset=UTF-8"> 5 | <title>Canopy API Spec 6 | 7 | 8 | 9 | 15 | 16 | 17 | 18 |
19 | Redoc 20 | 21 | 25 | 26 | 27 | """ # noqa: E501 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .unit.stubs.stub_tokenizer import StubTokenizer 2 | from canopy.tokenizer import Tokenizer 3 | 4 | Tokenizer.initialize(StubTokenizer) 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | TEST_NAMESPACE = "ns" 4 | TEST_CREATE_INDEX_PARAMS = [ 5 | {"spec": {"serverless": {"cloud": "aws", "region": "us-west-2"}}}, 6 | {"spec": {"pod": {"environment": "eu-west1-gcp", "pod_type": "p1.x1"}}}, 7 | {"spec": {"pod": {"environment": "gcp-starter", "pod_type": "p1.x1"}}}, 8 | ] 9 | 10 | 11 | @pytest.fixture(scope="module", params=[None, TEST_NAMESPACE]) 12 | def namespace(request): 13 | return request.param 14 | 15 | 16 | @pytest.fixture(scope="module", 17 | params=TEST_CREATE_INDEX_PARAMS, 18 | # The first key in the spec is the index type ("serverless" \ "pod") 19 | ids=[next(iter(_["spec"])) for _ in TEST_CREATE_INDEX_PARAMS]) 20 | def create_index_params(request): 21 | return request.param 22 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/system/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/knowledge_base/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/knowledge_base/qdrant/__init__.py -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | from canopy.knowledge_base.qdrant.constants import DENSE_VECTOR_NAME 4 | from canopy.knowledge_base.qdrant.converter import QdrantConverter 5 | from canopy.knowledge_base.qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 6 | 7 | import logging 8 | from typing import List 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def total_vectors_in_collection(knowledge_base: QdrantKnowledgeBase): 14 | return knowledge_base._client.count(knowledge_base.collection_name).count 15 | 16 | 17 | def assert_chunks_in_collection(knowledge_base: QdrantKnowledgeBase, encoded_chunks): 18 | ids = [QdrantConverter.convert_id(c.id) for c in encoded_chunks] 19 | fetch_result = 
knowledge_base._client.retrieve( 20 | knowledge_base.collection_name, ids=ids, with_payload=True, with_vectors=True 21 | ) 22 | points = {p.id: p for p in fetch_result} 23 | for chunk in encoded_chunks: 24 | id = QdrantConverter.convert_id(chunk.id) 25 | assert id in points 26 | point = points[id] 27 | assert np.allclose( 28 | point.vector[DENSE_VECTOR_NAME], 29 | np.array(chunk.values, dtype=np.float32), 30 | atol=1e-8, 31 | ) 32 | 33 | assert point.payload["text"] == chunk.text 34 | assert point.payload["document_id"] == chunk.document_id 35 | assert point.payload["source"] == chunk.source 36 | for key, value in chunk.metadata.items(): 37 | assert point.payload[key] == value 38 | 39 | 40 | def assert_ids_in_collection(knowledge_base, ids): 41 | fetch_result = knowledge_base._client.retrieve( 42 | knowledge_base.collection_name, 43 | ids=ids, 44 | ) 45 | assert len(fetch_result) == len( 46 | ids 47 | ), f"Expected {len(ids)} ids, got {len(fetch_result)}" 48 | 49 | 50 | def assert_num_points_in_collection(knowledge_base, num_vectors): 51 | points_in_index = total_vectors_in_collection(knowledge_base) 52 | assert ( 53 | points_in_index == num_vectors 54 | ), f"Expected {num_vectors} vectors in index, got {points_in_index}" 55 | 56 | 57 | def assert_ids_not_in_collection(knowledge_base, ids): 58 | fetch_result = knowledge_base._client.retrieve( 59 | knowledge_base.collection_name, 60 | ids=ids, 61 | ) 62 | assert len(fetch_result) == 0, f"Found {len(fetch_result)} unexpected ids" 63 | 64 | 65 | def qdrant_server_running() -> bool: 66 | """Check if Qdrant server is running.""" 67 | 68 | try: 69 | response = requests.get("http://localhost:6333", timeout=10.0) 70 | response_json = response.json() 71 | return response_json.get("title") == "qdrant - vector search engine" 72 | except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): 73 | return False 74 | 75 | 76 | def qdrant_locations() -> List[str]: 77 | if not qdrant_server_running(): 78 | logger.warning("Running Qdrant tests in memory mode only.") 79 | return [":memory:"] 80 | return ["http://localhost:6333", ":memory:"] 81 | -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.knowledge_base.qdrant.constants import COLLECTION_NAME_PREFIX 3 | from canopy.knowledge_base.qdrant.qdrant_knowledge_base import QdrantKnowledgeBase 4 | from canopy.models.data_models import Document 5 | from tests.system.knowledge_base.qdrant.common import qdrant_locations 6 | from tests.system.knowledge_base.test_knowledge_base import _generate_text 7 | from tests.unit.stubs.stub_chunker import StubChunker 8 | from tests.unit.stubs.stub_dense_encoder import StubDenseEncoder 9 | from tests.unit.stubs.stub_record_encoder import StubRecordEncoder 10 | from tests.util import create_system_tests_index_name 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def collection_name(testrun_uid): 15 | return create_system_tests_index_name(testrun_uid) 16 | 17 | 18 | @pytest.fixture(scope="module") 19 | def collection_full_name(collection_name): 20 | return COLLECTION_NAME_PREFIX + collection_name 21 | 22 | 23 | @pytest.fixture(scope="module") 24 | def chunker(): 25 | return StubChunker(num_chunks_per_doc=2) 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def encoder(): 30 | return StubRecordEncoder(StubDenseEncoder()) 31 | 32 | 33 | @pytest.fixture(scope="module", 
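# Parametrized over qdrant_locations(): always an in-memory instance, plus
# http://localhost:6333 when a local Qdrant server is reachable (see common.py above).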
autouse=True, params=qdrant_locations()) 34 | def knowledge_base(collection_name, chunker, encoder, request): 35 | kb = QdrantKnowledgeBase( 36 | collection_name=collection_name, 37 | record_encoder=encoder, 38 | chunker=chunker, 39 | location=request.param, 40 | ) 41 | kb.create_canopy_collection() 42 | 43 | return kb 44 | 45 | 46 | @pytest.fixture 47 | def documents_large(): 48 | return [ 49 | Document( 50 | id=f"doc_{i}_large", 51 | text=f"Sample document {i}", 52 | metadata={"my-key-large": f"value-{i}"}, 53 | ) 54 | for i in range(1000) 55 | ] 56 | 57 | 58 | @pytest.fixture 59 | def encoded_chunks_large(documents_large, chunker, encoder): 60 | chunks = chunker.chunk_documents(documents_large) 61 | return encoder.encode_documents(chunks) 62 | 63 | 64 | @pytest.fixture 65 | def documents_with_datetime_metadata(): 66 | return [ 67 | Document( 68 | id="doc_1_metadata", 69 | text="document with datetime metadata", 70 | source="source_1", 71 | metadata={ 72 | "datetime": "2021-01-01T00:00:00Z", 73 | "datetime_other_format": "January 1, 2021 00:00:00", 74 | "datetime_other_format_2": "2210.03945", 75 | }, 76 | ), 77 | Document(id="2021-01-01T00:00:00Z", text="id is datetime", source="source_1"), 78 | ] 79 | 80 | 81 | @pytest.fixture 82 | def datetime_metadata_encoded_chunks( 83 | documents_with_datetime_metadata, chunker, encoder 84 | ): 85 | chunks = chunker.chunk_documents(documents_with_datetime_metadata) 86 | return encoder.encode_documents(chunks) 87 | 88 | 89 | @pytest.fixture 90 | def encoded_chunks(documents, chunker, encoder): 91 | chunks = chunker.chunk_documents(documents) 92 | return encoder.encode_documents(chunks) 93 | 94 | 95 | @pytest.fixture(scope="module", autouse=True) 96 | def teardown_knowledge_base(collection_full_name, knowledge_base): 97 | yield 98 | 99 | knowledge_base._client.delete_collection(collection_full_name) 100 | knowledge_base.close() 101 | 102 | 103 | @pytest.fixture(scope="module") 104 | def random_texts(): 105 | return [_generate_text(10) for _ in range(5)] 106 | 107 | 108 | @pytest.fixture 109 | def documents(random_texts): 110 | return [ 111 | Document( 112 | id=f"doc_{i}", 113 | text=random_texts[i], 114 | source=f"source_{i}", 115 | metadata={"my-key": f"value-{i}"}, 116 | ) 117 | for i in range(5) 118 | ] 119 | -------------------------------------------------------------------------------- /tests/system/knowledge_base/qdrant/test_config.yml: -------------------------------------------------------------------------------- 1 | # =========================================================== 2 | # QdrantKnowledgeBase test configuration file 3 | # =========================================================== 4 | 5 | knowledge_base: 6 | params: 7 | default_top_k: 5 8 | collection_name: test-config-collection 9 | default_top_k: 10 -------------------------------------------------------------------------------- /tests/system/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/llm/__init__.py -------------------------------------------------------------------------------- /tests/system/llm/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.models.data_models import UserMessage, AssistantMessage 4 | 5 | 6 | @pytest.fixture 7 | def messages(): 8 | # Create a list of MessageBase objects 9 | return [ 10 | UserMessage(content="Hello, 
assistant."), 11 | AssistantMessage(content="Hello, user. How can I assist you?"), 12 | UserMessage(content="Just checking in. Be concise."), 13 | ] 14 | -------------------------------------------------------------------------------- /tests/system/llm/test_azure_openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.llm import AzureOpenAILLM 6 | from .test_openai import SYSTEM_PROMPT 7 | 8 | MODEL_NAME = os.getenv("AZURE_DEPLOYMENT_NAME") 9 | 10 | 11 | @pytest.fixture 12 | def azure_openai_llm(): 13 | if os.getenv("AZURE_DEPLOYMENT_NAME") is None: 14 | pytest.skip( 15 | "Couldn't find Azure deployment name. Skipping Azure OpenAI tests." 16 | ) 17 | return AzureOpenAILLM(model_name=os.getenv("AZURE_DEPLOYMENT_NAME")) 18 | 19 | 20 | def test_init_params(azure_openai_llm): 21 | llm = AzureOpenAILLM( 22 | model_name="test_model_name", 23 | api_version="2020-05-03", 24 | api_key="test_api_key", 25 | temperature=0.9, 26 | top_p=0.95, 27 | n=3, 28 | ) 29 | 30 | assert llm.model_name == "test_model_name" 31 | assert llm.default_model_params["temperature"] == 0.9 32 | assert llm.default_model_params["top_p"] == 0.95 33 | assert llm.default_model_params["n"] == 3 34 | assert llm._client.api_key == "test_api_key" 35 | assert llm._client._api_version == "2020-05-03" 36 | 37 | 38 | @pytest.fixture() 39 | def no_api_key(): 40 | before = os.environ.pop("AZURE_OPENAI_API_KEY", None) 41 | yield 42 | if before is not None: 43 | os.environ["AZURE_OPENAI_API_KEY"] = before 44 | 45 | 46 | def test_missing_api_key(no_api_key): 47 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_API_KEY"): 48 | AzureOpenAILLM(MODEL_NAME) 49 | 50 | 51 | @pytest.fixture() 52 | def bad_api_key(): 53 | before = os.environ.pop("AZURE_OPENAI_API_KEY", None) 54 | os.environ["AZURE_OPENAI_API_KEY"] = "bad key" 55 | yield 56 | if before is not None: 57 | os.environ["AZURE_OPENAI_API_KEY"] = before 58 | 59 | 60 | def test_bad_api_key(bad_api_key, messages): 61 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_API_KEY"): 62 | llm = AzureOpenAILLM(MODEL_NAME) 63 | llm.chat_completion(system_prompt=SYSTEM_PROMPT, chat_history=messages) 64 | 65 | 66 | @pytest.fixture() 67 | def no_azure_endpoint(): 68 | before = os.environ.pop("AZURE_OPENAI_ENDPOINT", None) 69 | yield 70 | if before is not None: 71 | os.environ["AZURE_OPENAI_ENDPOINT"] = before 72 | 73 | 74 | def test_missing_azure_endpoint(no_azure_endpoint): 75 | with pytest.raises(RuntimeError, match="AZURE_OPENAI_ENDPOINT"): 76 | AzureOpenAILLM(MODEL_NAME) 77 | 78 | 79 | @pytest.fixture() 80 | def bad_azure_endpoint(): 81 | before = os.environ.pop("AZURE_OPENAI_ENDPOINT", None) 82 | os.environ["AZURE_OPENAI_ENDPOINT"] = "bad endpoint" 83 | yield 84 | if before is not None: 85 | os.environ["AZURE_OPENAI_ENDPOINT"] = before 86 | 87 | 88 | def test_bad_azure_endpoint(bad_azure_endpoint, messages): 89 | with pytest.raises(RuntimeError, match="Azure OpenAI endpoint"): 90 | llm = AzureOpenAILLM(MODEL_NAME) 91 | llm.chat_completion(system_prompt=SYSTEM_PROMPT, chat_history=messages) 92 | 93 | # def test_function_calling_error(azure_openai_llm): 94 | -------------------------------------------------------------------------------- /tests/system/query_generator/test_cohere_query_generator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.query_generator.cohere import CohereQueryGenerator 4 | from 
canopy.models.data_models import MessageBase, Role 5 | 6 | 7 | @pytest.fixture 8 | def messages(): 9 | return [ 10 | MessageBase( 11 | role=Role.USER, content="Hello, assistant."), 12 | MessageBase( 13 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?"), 14 | MessageBase( 15 | role=Role.USER, content="How do I init a pinecone client?.") 16 | ] 17 | 18 | 19 | def test_generate_queries(messages): 20 | query_generator = CohereQueryGenerator() 21 | queries = query_generator.generate(messages, max_prompt_tokens=100) 22 | assert queries 23 | assert queries[0].text 24 | 25 | 26 | def test_max_tokens_exceeded_raises_error(messages): 27 | query_generator = CohereQueryGenerator() 28 | 29 | with pytest.raises(ValueError): 30 | query_generator.generate(messages, max_prompt_tokens=10) 31 | -------------------------------------------------------------------------------- /tests/system/query_generator/test_query_generator_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.tokenizer.tokenizer import Tokenizer # noqa 4 | from canopy.llm.openai import OpenAILLM # noqa 5 | from canopy.models.data_models import MessageBase, Query # noqa 6 | from canopy.chat_engine.query_generator import FunctionCallingQueryGenerator # noqa 7 | from typing import List # noqa 8 | 9 | 10 | class TestFunctionCallingQueryGeneratorSystem: 11 | 12 | @staticmethod 13 | @pytest.fixture 14 | def openai_llm(): 15 | Tokenizer.initialize() 16 | 17 | @staticmethod 18 | @pytest.fixture 19 | def query_generator(openai_llm): 20 | query_gen = FunctionCallingQueryGenerator( 21 | llm=openai_llm, 22 | ) 23 | return query_gen 24 | 25 | @staticmethod 26 | @pytest.fixture 27 | def sample_messages(): 28 | return [ 29 | MessageBase(role="user", content="What is photosynthesis?") 30 | ] 31 | 32 | @staticmethod 33 | def test_generate_default_params(query_generator, 34 | sample_messages): 35 | result = query_generator.generate(messages=sample_messages, 36 | max_prompt_tokens=100) 37 | assert isinstance(result, List) 38 | assert len(result) > 0 39 | for query in result: 40 | assert isinstance(query, Query) 41 | assert len(query.text) > 0 42 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_anyscale_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.anyscale import AnyscaleRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return AnyscaleRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == 
len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_cohere_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.cohere import CohereRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return CohereRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_jina_record_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.knowledge_base.record_encoder.jina import JinaRecordEncoder 7 | from canopy.models.data_models import Query 8 | 9 | 10 | documents = [KBDocChunk( 11 | id=f"doc_1_{i}", 12 | text=f"Sample document {i}", 13 | document_id=f"doc_{i}", 14 | metadata={"test": i}, 15 | source="doc_1", 16 | ) 17 | for i in range(4) 18 | ] 19 | 20 | queries = [Query(text="Sample query 1"), 21 | Query(text="Sample query 2"), 22 | Query(text="Sample query 3"), 23 | Query(text="Sample query 4")] 24 | 25 | 26 | @pytest.fixture 27 | def encoder(): 28 | if os.getenv("JINA_API_KEY", None) is None: 29 | pytest.skip("Did not find JINA_API_KEY environment variable. 
Skipping...") 30 | return JinaRecordEncoder(batch_size=2) 31 | 32 | 33 | def test_dimension(encoder): 34 | assert encoder.dimension == 768 35 | 36 | 37 | @pytest.mark.parametrize("items,function", 38 | [(documents, "encode_documents"), 39 | (queries, "encode_queries"), 40 | ([], "encode_documents"), 41 | ([], "encode_queries")]) 42 | def test_encode_documents(encoder, items, function): 43 | 44 | encoded_documents = getattr(encoder, function)(items) 45 | 46 | assert len(encoded_documents) == len(items) 47 | assert all(len(encoded.values) == encoder.dimension 48 | for encoded in encoded_documents) 49 | 50 | 51 | @pytest.mark.asyncio 52 | @pytest.mark.parametrize("items,function", 53 | [("aencode_documents", documents), 54 | ("aencode_queries", queries)]) 55 | async def test_aencode_not_implemented(encoder, function, items): 56 | with pytest.raises(NotImplementedError): 57 | await encoder.aencode_queries(items) 58 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_octoai_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.octoai import OctoAIRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | 8 | documents = [KBDocChunk( 9 | id=f"doc_1_{i}", 10 | text=f"Sample document {i}", 11 | document_id=f"doc_{i}", 12 | metadata={"test": i}, 13 | source="doc_1", 14 | ) 15 | for i in range(4) 16 | ] 17 | 18 | queries = [Query(text="Sample query 1"), 19 | Query(text="Sample query 2"), 20 | Query(text="Sample query 3"), 21 | Query(text="Sample query 4")] 22 | 23 | 24 | @pytest.fixture 25 | def encoder(): 26 | return OctoAIRecordEncoder(batch_size=2) 27 | 28 | 29 | def test_dimension(encoder): 30 | assert encoder.dimension == 1024 31 | 32 | 33 | @pytest.mark.parametrize("items,function", 34 | [(documents, "encode_documents"), 35 | (queries, "encode_queries"), 36 | ([], "encode_documents"), 37 | ([], "encode_queries")]) 38 | def test_encode_documents(encoder, items, function): 39 | 40 | encoded_documents = getattr(encoder, function)(items) 41 | 42 | assert len(encoded_documents) == len(items) 43 | assert all(len(encoded.values) == encoder.dimension 44 | for encoded in encoded_documents) 45 | 46 | 47 | @pytest.mark.asyncio 48 | @pytest.mark.parametrize("items,function", 49 | [("aencode_documents", documents), 50 | ("aencode_queries", queries)]) 51 | async def test_aencode_not_implemented(encoder, function, items): 52 | with pytest.raises(NotImplementedError): 53 | await encoder.aencode_queries(items) 54 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_openai_record_encoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBDocChunk 6 | from canopy.knowledge_base.record_encoder import AzureOpenAIRecordEncoder 7 | from canopy.knowledge_base.record_encoder.openai import OpenAIRecordEncoder 8 | from canopy.models.data_models import Query 9 | 10 | 11 | documents = [KBDocChunk( 12 | id=f"doc_1_{i}", 13 | text=f"Sample document {i}", 14 | document_id=f"doc_{i}", 15 | metadata={"test": i}, 16 | source="doc_1", 17 | ) 18 | for i in range(4) 19 | ] 20 | 21 | queries = [Query(text="Sample query 1"), 22 | Query(text="Sample query 2"), 23 | Query(text="Sample query 3"), 24 | 
Query(text="Sample query 4")] 25 | 26 | 27 | @pytest.fixture(params=[OpenAIRecordEncoder, AzureOpenAIRecordEncoder]) 28 | def encoder(request): 29 | encoder_class = request.param 30 | if encoder_class == AzureOpenAIRecordEncoder: 31 | model_name = os.getenv("AZURE_EMBEDDING_DEPLOYMENT_NAME") 32 | if model_name is None: 33 | pytest.skip( 34 | "Couldn't find Azure deployment name. Skipping Azure OpenAI tests." 35 | ) 36 | return AzureOpenAIRecordEncoder(model_name=model_name, batch_size=2) 37 | elif encoder_class == OpenAIRecordEncoder: 38 | return OpenAIRecordEncoder(batch_size=2) 39 | 40 | 41 | def test_dimension(encoder): 42 | assert encoder.dimension == 1536 43 | 44 | 45 | @pytest.mark.parametrize("items,function", 46 | [(documents, "encode_documents"), 47 | (queries, "encode_queries"), 48 | ([], "encode_documents"), 49 | ([], "encode_queries")]) 50 | def test_encode_documents(encoder, items, function): 51 | 52 | encoded_documents = getattr(encoder, function)(items) 53 | 54 | assert len(encoded_documents) == len(items) 55 | assert all(len(encoded.values) == encoder.dimension 56 | for encoded in encoded_documents) 57 | 58 | 59 | @pytest.mark.asyncio 60 | @pytest.mark.parametrize("items,function", 61 | [("aencode_documents", documents), 62 | ("aencode_queries", queries)]) 63 | async def test_aencode_not_implemented(encoder, function, items): 64 | with pytest.raises(NotImplementedError): 65 | await encoder.aencode_queries(items) 66 | -------------------------------------------------------------------------------- /tests/system/record_encoder/test_sentence_transformers_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.sentence_transformers import ( 5 | SentenceTransformerRecordEncoder 6 | ) 7 | from canopy.models.data_models import Query 8 | 9 | documents = [KBDocChunk( 10 | id=f"doc_1_{i}", 11 | text=f"Sample document {i}", 12 | document_id=f"doc_{i}", 13 | metadata={"test": i}, 14 | source="doc_1", 15 | ) 16 | for i in range(4) 17 | ] 18 | 19 | queries = [Query(text="Sample query 1"), 20 | Query(text="Sample query 2"), 21 | Query(text="Sample query 3"), 22 | Query(text="Sample query 4")] 23 | 24 | 25 | @pytest.fixture 26 | def encoder(): 27 | try: 28 | encoder = SentenceTransformerRecordEncoder(batch_size=2) 29 | except ImportError: 30 | pytest.skip( 31 | "`transformers` extra not installed. 
Skipping SentenceTransformer system " 32 | "tests" 33 | ) 34 | return encoder 35 | 36 | 37 | def test_dimension(encoder): 38 | assert encoder.dimension == 384 39 | 40 | 41 | @pytest.mark.parametrize("items,function", 42 | [(documents, "encode_documents"), 43 | (queries, "encode_queries"), 44 | ([], "encode_documents"), 45 | ([], "encode_queries")]) 46 | def test_encode_documents(encoder, items, function): 47 | 48 | encoded_documents = getattr(encoder, function)(items) 49 | 50 | assert len(encoded_documents) == len(items) 51 | assert all(len(encoded.values) == encoder.dimension 52 | for encoded in encoded_documents) 53 | 54 | 55 | @pytest.mark.asyncio 56 | @pytest.mark.parametrize("items,function", 57 | [("aencode_documents", documents), 58 | ("aencode_queries", queries)]) 59 | async def test_aencode_not_implemented(encoder, function, items): 60 | with pytest.raises(NotImplementedError): 61 | await encoder.aencode_queries(items) 62 | -------------------------------------------------------------------------------- /tests/system/reranker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/reranker/__init__.py -------------------------------------------------------------------------------- /tests/system/reranker/test_cohere_reranker.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.knowledge_base.models import KBQueryResult, KBDocChunkWithScore 6 | from canopy.knowledge_base.reranker import CohereReranker 7 | 8 | 9 | @pytest.fixture 10 | def should_run_test(): 11 | if os.getenv("CO_API_KEY") is None: 12 | pytest.skip( 13 | "Couldn't find Cohere API key. Skipping Cohere tests." 
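# These tests call the live Cohere API, so they are skipped rather than failed
# when no CO_API_KEY is configured.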
14 | ) 15 | 16 | 17 | @pytest.fixture 18 | def cohere_reranker(should_run_test): 19 | return CohereReranker() 20 | 21 | 22 | @pytest.fixture 23 | def documents(): 24 | return [ 25 | KBDocChunkWithScore( 26 | id=f"doc_1_{i}", 27 | text=f"Sample chunk {i}", 28 | document_id="doc_1", 29 | source="doc_1", 30 | score=0.1 * i 31 | ) for i in range(4) 32 | ] 33 | 34 | 35 | @pytest.fixture 36 | def query_result(documents): 37 | return KBQueryResult(query="Sample query 1", 38 | documents=documents) 39 | 40 | 41 | def test_rerank_empty(cohere_reranker): 42 | results = cohere_reranker.rerank([]) 43 | assert results == [] 44 | 45 | 46 | def test_rerank(cohere_reranker, query_result, documents): 47 | id_to_score = {d.id: d.score for d in query_result.documents} 48 | ranked_result = next(iter(cohere_reranker.rerank([query_result]))) 49 | reranked_scores = [doc.score for doc in ranked_result.documents] 50 | 51 | assert len(ranked_result.documents) == len(documents) 52 | assert reranked_scores == sorted(reranked_scores, reverse=True) 53 | 54 | # Make sure the scores are overriden by the reranker 55 | for doc in ranked_result.documents: 56 | assert doc.score != id_to_score[doc.id] 57 | 58 | 59 | def test_bad_api_key(should_run_test, query_result): 60 | with pytest.raises(RuntimeError, match="invalid api token"): 61 | CohereReranker(api_key="bad key").rerank([query_result]) 62 | 63 | 64 | def test_model_name_invalid(should_run_test, query_result): 65 | with pytest.raises(RuntimeError, match="model .* not found"): 66 | CohereReranker(model_name="my-madeup-model").rerank([query_result]) 67 | 68 | 69 | def test_top_n(should_run_test, query_result): 70 | results = CohereReranker(top_n=1).rerank([query_result]) 71 | assert len(results[0].documents) == 1 72 | -------------------------------------------------------------------------------- /tests/system/reranker/test_transparent_reranker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunkWithScore, KBQueryResult 4 | from canopy.knowledge_base.reranker import TransparentReranker 5 | 6 | 7 | @pytest.fixture 8 | def documents(): 9 | return [ 10 | KBDocChunkWithScore( 11 | id=f"doc_1_{i}", 12 | text=f"Sample chunk {i}", 13 | document_id="doc_1", 14 | source="doc_1", 15 | score=0.1 * i 16 | ) for i in range(1) 17 | ] 18 | 19 | 20 | @pytest.fixture 21 | def query_result(documents): 22 | return KBQueryResult(query="Sample query 1", 23 | documents=documents) 24 | 25 | 26 | def test_rerank(query_result): 27 | assert TransparentReranker().rerank([query_result]) == [query_result] 28 | -------------------------------------------------------------------------------- /tests/system/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/tokenizer/__init__.py -------------------------------------------------------------------------------- /tests/system/tokenizer/test_cohere_api_tokenizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from canopy.models.data_models import MessageBase, Role 6 | from canopy.tokenizer import CohereAPITokenizer 7 | from ...unit.tokenizer.base_test_tokenizer import BaseTestTokenizer 8 | 9 | 10 | class TestCohereAPITokenizer(BaseTestTokenizer): 11 | @staticmethod 12 | @pytest.fixture(scope="class") 13 | def 
tokenizer(): 14 | if not os.getenv("CO_API_KEY"): 15 | pytest.skip("Skipping Cohere API tokenizer tests because " 16 | "the CO_API_KEY environment variable is not set.") 17 | return CohereAPITokenizer(model_name="command") 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | def text(): 22 | return "string with special characters like !@#$%^&*()_+日本 " \ 23 | "spaces \n \n\n CASE cAse " 24 | 25 | @staticmethod 26 | @pytest.fixture 27 | def expected_tokens(text): 28 | return ['string', ' with', ' special', ' characters', ' like', 29 | ' !', '@', '#', '$', '%', '^', '&', '*', '()', '_', '+', '日', 30 | '本', ' spaces', ' ', '\n ', '\n\n', ' CASE', ' c', 'A', 31 | 'se', " "] 32 | 33 | @staticmethod 34 | def test_messages_token_count(tokenizer): 35 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 36 | assert tokenizer.messages_token_count(messages) == 11 37 | 38 | messages = [MessageBase(role=Role.USER, 39 | content="Hello, assistant."), 40 | MessageBase(role=Role.ASSISTANT, 41 | content="Hello, user. How can I assist you?")] 42 | assert tokenizer.messages_token_count(messages) == 25 43 | 44 | @staticmethod 45 | def test_messages_token_count_empty_messages(tokenizer): 46 | assert tokenizer.messages_token_count([]) == 3 47 | -------------------------------------------------------------------------------- /tests/system/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/system/utils/__init__.py -------------------------------------------------------------------------------- /tests/system/utils/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import yaml 5 | 6 | from canopy.chat_engine import ChatEngine 7 | from canopy.context_engine import ContextEngine 8 | from canopy.knowledge_base import KnowledgeBase 9 | from canopy.utils.directory import Directory 10 | 11 | 12 | @pytest.fixture(scope='module') 13 | def temp_index_name(): 14 | index_name_before = os.getenv("INDEX_NAME", None) 15 | 16 | os.environ["INDEX_NAME"] = "temp_index" 17 | yield "temp_index" 18 | 19 | if index_name_before is None: 20 | del os.environ["INDEX_NAME"] 21 | else: 22 | os.environ["INDEX_NAME"] = index_name_before 23 | 24 | 25 | def test_default_config_matches_code_defaults(temp_index_name): 26 | 27 | with open(Directory.CONFIG_TEMPLATES.joinpath("default.yaml")) as file: 28 | default_config = yaml.safe_load(file) 29 | 30 | chat_engine_config = default_config['chat_engine'] 31 | 32 | loaded_chat_engine = ChatEngine.from_config(chat_engine_config) 33 | default_kb = KnowledgeBase(index_name=temp_index_name) 34 | default_context_engine = ContextEngine(default_kb) 35 | default_chat_engine = ChatEngine(default_context_engine) 36 | 37 | def assert_identical_components(loaded_component, default_component): 38 | assert type(loaded_component) == type(default_component) # noqa: E721 39 | if not loaded_component.__module__.startswith("canopy"): 40 | return 41 | 42 | for key, value in default_component.__dict__.items(): 43 | assert hasattr(loaded_component, key), ( 44 | f"Missing attribute {key} in {type(loaded_component)}" 45 | ) 46 | if hasattr(value, '__dict__'): 47 | assert_identical_components(getattr(loaded_component, key), value) 48 | else: 49 | assert getattr(loaded_component, key) == value, ( 50 | f"Attribute {key} in {type(loaded_component)} is {value} in code " 51 | f"but 
{getattr(loaded_component, key)} in config" 52 | ) 53 | 54 | assert_identical_components(loaded_chat_engine, default_chat_engine) 55 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | random_words = [ 2 | "apple", "banana", "cherry", "date", "elephant", "flamingo", "grape", "honey", 3 | "iceberg", "jacket", "kangaroo", "lemon", "mango", "noodle", "octopus", "penguin", 4 | "quill", "raspberry", "strawberry", "tiger", "umbrella", "violin", "walrus", 5 | "xylophone", "yarn", "zebra", "ant", "bear", "cat", "dog", "eagle", "falcon", 6 | "giraffe", "horse", "iguana", "jellyfish", "koala", "lion", "monkey", "newt", 7 | "ostrich", "parrot", "quokka", "rhino", "snake", "turtle", "urchin", "vulture", 8 | "whale", "x-ray", "yak", "zeppelin", "atom", "bubble", "candle", "desk", "elevator", 9 | "fan", "globe", "hat", "ice", "juice", "kite", "lamp", "mountain", "nail", "orange", 10 | "piano", "quartz", "river", "sun", "tree", "unicorn", "volcano", "wind", 11 | "yogurt", "zipper", "accordion", "bat", "cymbal", "drum", "flute", "guitar", 12 | "harmonica", "ivory", "jazz", "keyboard", "lyre", "maracas", "note", "organ", 13 | "piccolo", "quena", "recorder", "saxophone", "trumpet", "ukulele", "viola", 14 | "yacht", "zone", "adventure", "backpack", "calendar", "dolphin", "equator", 15 | "gazelle", "helicopter", "island", "jigsaw", "kaleidoscope", "lighthouse", 16 | "narrator", "obelisk", "puzzle", "quicksand", "rainbow", "satellite", "telescope", 17 | "utensil", "vortex", "wavelength", "xenon", "yodel", "zucchini", "asteroid", 18 | "crescent", "dynamo", "echo", "fractal", "galaxy", "hexagon", "infinity", "jungle", 19 | "krypton", "lunar", "meteor", "nebula", "orbit", "prism", "quasar", "radiator", 20 | "tornado", "universe", "vector", "warp", "x-axis", "yellow", "zenith", 21 | "xylograph", "window", "festival", "molecule", "biscuit", "solar", 22 | ] 23 | -------------------------------------------------------------------------------- /tests/unit/chat_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/chat_engine/__init__.py -------------------------------------------------------------------------------- /tests/unit/chunker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/chunker/__init__.py -------------------------------------------------------------------------------- /tests/unit/chunker/base_test_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from abc import ABC, abstractmethod 3 | from canopy.models.data_models import Document 4 | 5 | 6 | class BaseTestChunker(ABC): 7 | 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | @abstractmethod 11 | def chunker(): 12 | pass 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def documents(): 17 | return [ 18 | Document( 19 | id="test_document_1", 20 | text="I am a simple test string" 21 | " to check the happy path of this simple chunker", 22 | metadata={"test": 1}), 23 | Document( 24 | id="test_document_2", 25 | text="another simple test string", 26 | metadata={"test": 2}, 27 | source="doc_2" 28 | ), 29 | Document( 30 | id="test_document_3", 31 | 
text="short", 32 | metadata={"test": 2}, 33 | source="doc_3" 34 | ) 35 | ] 36 | 37 | @staticmethod 38 | @pytest.fixture 39 | @abstractmethod 40 | def expected_chunks(documents): 41 | pass 42 | 43 | # region: test chunk_single_document 44 | 45 | @staticmethod 46 | def test_chunk_single_document_happy_path(chunker, documents, expected_chunks): 47 | for doc in documents: 48 | expected_chunks_for_doc = [chunk for chunk in 49 | expected_chunks if chunk.document_id == doc.id] 50 | actual_chunks = chunker.chunk_single_document(doc) 51 | assert len(actual_chunks) == len(expected_chunks_for_doc) 52 | for actual_chunk, expected_chunk in zip(actual_chunks, 53 | expected_chunks_for_doc): 54 | assert actual_chunk == expected_chunk, f"actual: {actual_chunk}\n, " \ 55 | f"expected: {expected_chunk}" 56 | 57 | @staticmethod 58 | def test_chunk_single_document_empty_content(chunker, documents): 59 | empty_document = Document(id="test_document_3", text="", metadata={"test": 3}) 60 | assert chunker.chunk_single_document(empty_document) == [] 61 | 62 | # endregion 63 | 64 | # region: test achunk_single_document 65 | 66 | @staticmethod 67 | @pytest.mark.asyncio 68 | async def test_achunk_single_document_raise_error(chunker, 69 | documents, 70 | expected_chunks): 71 | with pytest.raises(NotImplementedError): 72 | await chunker.achunk_single_document(documents[0]) 73 | 74 | # endregion 75 | 76 | # region: test chunk_documents 77 | 78 | @staticmethod 79 | def test_chunk_documents_happy_path(chunker, 80 | documents, 81 | expected_chunks): 82 | chunks = chunker.chunk_documents(documents) 83 | assert len(chunks) == len(expected_chunks) 84 | for chunk, expected_chunk in zip(chunks, expected_chunks): 85 | assert chunk == expected_chunk 86 | 87 | @staticmethod 88 | def test_chunk_documents_empty_list(chunker): 89 | assert chunker.chunk_documents([]) == [] 90 | 91 | @staticmethod 92 | def test_chunk_documents_empty_content(chunker): 93 | empty_document = Document(id="test_document_3", text="", metadata={"test": 3}) 94 | assert chunker.chunk_documents([empty_document]) == [] 95 | 96 | # endregion 97 | 98 | # region: test achunk_documents 99 | 100 | @staticmethod 101 | @pytest.mark.asyncio 102 | async def test_achunk_documents_raise_error(chunker, documents): 103 | with pytest.raises(NotImplementedError): 104 | await chunker.achunk_documents(documents) 105 | 106 | # endregion 107 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_recursive_character_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.knowledge_base.chunker.recursive_character \ 3 | import RecursiveCharacterChunker 4 | from canopy.knowledge_base.models import KBDocChunk 5 | from tests.unit.chunker.base_test_chunker import BaseTestChunker 6 | 7 | 8 | class TestRecursiveCharacterChunker(BaseTestChunker): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def chunker(): 13 | return RecursiveCharacterChunker(chunk_size=3, 14 | chunk_overlap=1) 15 | 16 | @staticmethod 17 | @pytest.fixture 18 | def expected_chunks(documents): 19 | return [ 20 | KBDocChunk(id='test_document_1_0', 21 | text='I am a', 22 | metadata={'test': 1}, 23 | document_id='test_document_1'), 24 | KBDocChunk(id='test_document_1_1', 25 | text='a simple test', 26 | metadata={'test': 1}, 27 | document_id='test_document_1'), 28 | KBDocChunk(id='test_document_1_2', 29 | text='test string to', 30 | metadata={'test': 1}, 31 | document_id='test_document_1'), 
32 | KBDocChunk(id='test_document_1_3', 33 | text='to check the', 34 | metadata={'test': 1}, 35 | document_id='test_document_1'), 36 | KBDocChunk(id='test_document_1_4', 37 | text='the happy path', 38 | metadata={'test': 1}, 39 | document_id='test_document_1'), 40 | KBDocChunk(id='test_document_1_5', 41 | text='path of this', 42 | metadata={'test': 1}, 43 | document_id='test_document_1'), 44 | KBDocChunk(id='test_document_1_6', 45 | text='this simple chunker', 46 | metadata={'test': 1}, 47 | document_id='test_document_1'), 48 | KBDocChunk(id='test_document_2_0', 49 | text='another simple test', 50 | metadata={'test': 2}, 51 | document_id='test_document_2', 52 | source='doc_2'), 53 | KBDocChunk(id='test_document_2_1', 54 | text='test string', 55 | metadata={'test': 2}, 56 | document_id='test_document_2', 57 | source='doc_2'), 58 | KBDocChunk(id='test_document_3_0', 59 | text='sho', 60 | metadata={'test': 2}, 61 | document_id='test_document_3', 62 | source='doc_3'), 63 | KBDocChunk(id='test_document_3_1', 64 | text='ort', 65 | metadata={'test': 2}, 66 | document_id='test_document_3', 67 | source='doc_3')] 68 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_stub_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from .base_test_chunker import BaseTestChunker 5 | from ..stubs.stub_chunker import StubChunker 6 | 7 | 8 | class TestStubChunker(BaseTestChunker): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def chunker(): 13 | return StubChunker() 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def expected_chunks(documents): 18 | return [KBDocChunk(id=f"{document.id}_0", 19 | document_id=document.id, 20 | text=document.text, 21 | metadata=document.metadata, 22 | source=document.source) 23 | for document in documents] 24 | -------------------------------------------------------------------------------- /tests/unit/chunker/test_token_chunker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.models.data_models import Document 5 | from .base_test_chunker import BaseTestChunker 6 | from canopy.knowledge_base.chunker.token_chunker import TokenChunker 7 | 8 | 9 | class TestTokenChunker(BaseTestChunker): 10 | 11 | @staticmethod 12 | @pytest.fixture(scope="class") 13 | def chunker(): 14 | return TokenChunker(max_chunk_size=5, 15 | overlap=2) 16 | 17 | @staticmethod 18 | @pytest.fixture 19 | def expected_chunks(documents): 20 | return [KBDocChunk(id='test_document_1_0', 21 | text='I am a simple test', 22 | metadata={'test': 1}, 23 | document_id='test_document_1'), 24 | KBDocChunk(id='test_document_1_1', 25 | text='simple test string to check', 26 | metadata={'test': 1}, 27 | document_id='test_document_1'), 28 | KBDocChunk(id='test_document_1_2', 29 | text='to check the happy path', 30 | metadata={'test': 1}, 31 | document_id='test_document_1'), 32 | KBDocChunk(id='test_document_1_3', 33 | text='happy path of this simple', 34 | metadata={'test': 1}, 35 | document_id='test_document_1'), 36 | KBDocChunk(id='test_document_1_4', 37 | text='this simple chunker', 38 | metadata={'test': 1}, 39 | document_id='test_document_1',), 40 | KBDocChunk(id='test_document_2_0', 41 | text='another simple test string', 42 | metadata={'test': 2}, 43 | document_id='test_document_2', 44 | 
source='doc_2'), 45 | KBDocChunk(id='test_document_3_0', 46 | text='short', 47 | metadata={'test': 2}, 48 | document_id='test_document_3', 49 | source='doc_3'), 50 | ] 51 | 52 | @staticmethod 53 | def test_chunk_single_document_zero_overlap(chunker): 54 | chunker._overlap = 0 55 | document = Document(id="test_document_1", 56 | text="I am a test string with no overlap", 57 | metadata={"test": 1}) 58 | actual = chunker.chunk_single_document(document) 59 | 60 | expected = [KBDocChunk(id='test_document_1_0', 61 | text='I am a test string', 62 | metadata={'test': 1}, 63 | document_id='test_document_1'), 64 | KBDocChunk(id='test_document_1_1', 65 | text='with no overlap', 66 | metadata={'test': 1}, 67 | document_id='test_document_1')] 68 | 69 | for actual_chunk, expected_chunk in zip(actual, expected): 70 | assert actual_chunk == expected_chunk 71 | 72 | @staticmethod 73 | def test_chunker_init_raise_on_negative_overlap(): 74 | with pytest.raises(ValueError): 75 | TokenChunker(max_chunk_size=5, 76 | overlap=-1) 77 | 78 | @staticmethod 79 | def test_chunker_init_raise_on_non_positive_max_tokens(): 80 | with pytest.raises(ValueError): 81 | TokenChunker(max_chunk_size=0, 82 | overlap=5) 83 | -------------------------------------------------------------------------------- /tests/unit/cli/test_non_schematic_data_loader.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pandas as pd 4 | from pandas.testing import assert_frame_equal 5 | 6 | from canopy_cli.data_loader.data_loader import ( 7 | DataLoaderException, 8 | _load_multiple_txt_files 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def two_valid_txt_files(tmpdir): 14 | file1 = tmpdir.join("file1.txt") 15 | file1.write("the little brown fox\njumped over the lazy dog") 16 | file2 = tmpdir.join("file2.txt") 17 | file2.write("meow meow meow\nmeow meow meow") 18 | return [file1, file2] 19 | 20 | 21 | @pytest.fixture 22 | def invalid_txt_file(tmpdir): 23 | file_path = tmpdir.join("file.txt") 24 | with open(str(file_path), 'w', encoding='latin-1') as file: 25 | file.write("This is a text with bad encoding for UTF-8. ñáéíóú") 26 | return [file_path] 27 | 28 | 29 | @pytest.fixture 30 | def one_invalid_rest_is_valid(tmpdir): 31 | file1 = tmpdir.join("file1.txt") 32 | file1.write("the little brown fox\njumped over the lazy dog") 33 | file2 = tmpdir.join("file2.txt") 34 | file2.write("meow meow meow\nmeow meow meow") 35 | file3 = tmpdir.join("file3.txt") 36 | with open(str(file3), 'w', encoding='latin-1') as file: 37 | file.write("This is a text with bad encoding for UTF-8. 
ñáéíóú") 38 | return [file1, file2, file3] 39 | 40 | 41 | def test_loading_files_good(two_valid_txt_files): 42 | expected = pd.DataFrame([ 43 | { 44 | "id": "file1", 45 | "text": "the little brown fox\njumped over the lazy dog", 46 | "source": str(two_valid_txt_files[0]) 47 | }, 48 | { 49 | "id": "file2", 50 | "text": "meow meow meow\nmeow meow meow", 51 | "source": str(two_valid_txt_files[1]) 52 | } 53 | ]) 54 | docs = _load_multiple_txt_files(two_valid_txt_files) 55 | assert isinstance(docs, pd.DataFrame) 56 | assert_frame_equal(docs, expected) 57 | 58 | 59 | def test_loading_files_bad(invalid_txt_file): 60 | with pytest.raises(DataLoaderException) as e: 61 | _load_multiple_txt_files(invalid_txt_file) 62 | assert str(e.value) == f""" 63 | {invalid_txt_file[0]}, line * - File must be UTF-8 encoded 64 | """ 65 | 66 | 67 | def test_loading_file_one_is_corrupted(one_invalid_rest_is_valid): 68 | expected = pd.DataFrame([ 69 | { 70 | "id": "file1", 71 | "text": "the little brown fox\njumped over the lazy dog", 72 | "source": str(one_invalid_rest_is_valid[0]) 73 | }, 74 | { 75 | "id": "file2", 76 | "text": "meow meow meow\nmeow meow meow", 77 | "source": str(one_invalid_rest_is_valid[1]) 78 | } 79 | ]) 80 | with pytest.raises(DataLoaderException) as e: 81 | _load_multiple_txt_files(one_invalid_rest_is_valid) 82 | assert str(e.value) == f""" 83 | {one_invalid_rest_is_valid[2]}, line * - File must be UTF-8 encoded 84 | """ 85 | remaining = _load_multiple_txt_files(one_invalid_rest_is_valid[:-1]) 86 | assert isinstance(remaining, pd.DataFrame) 87 | assert_frame_equal(remaining, expected) 88 | -------------------------------------------------------------------------------- /tests/unit/context_builder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/context_builder/__init__.py -------------------------------------------------------------------------------- /tests/unit/history_pruner/test_raising_history_pruner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.history_pruner import RaisingHistoryPruner 4 | from canopy.models.data_models import \ 5 | UserMessage, AssistantMessage, Context, StringContextContent 6 | from canopy.tokenizer import Tokenizer 7 | 8 | 9 | SAMPLE_CONTEXT = Context(content=StringContextContent( 10 | "Some context information" 11 | ), 12 | num_tokens=3 13 | ) 14 | SYSTEM_PROMPT = "This is a system prompt." 15 | 16 | 17 | @pytest.fixture 18 | def raising_history_builder(): 19 | return RaisingHistoryPruner() 20 | 21 | 22 | @pytest.fixture 23 | def sample_messages(): 24 | return [ 25 | UserMessage(content="Hello there!"), 26 | AssistantMessage(content="Hi! 
How can I help you?"), 27 | UserMessage(content="Tell me about the weather."), 28 | AssistantMessage(content="Anything else?"), 29 | UserMessage(content="No that's enough"), 30 | ] 31 | 32 | 33 | @pytest.mark.parametrize( 34 | "token_limit, expected_token_count, context, prompt", 35 | [ 36 | (33, 33, None, None), 37 | (50, 33, SAMPLE_CONTEXT, None), 38 | (50, 33, None, SYSTEM_PROMPT), 39 | (50, 33, SAMPLE_CONTEXT, SYSTEM_PROMPT), 40 | ], 41 | ids=[ 42 | "within_limit_no_context_no_prompt", 43 | "within_limit_with_context", 44 | "within_limit_with_prompt", 45 | "within_limit_with_context_and_prompt", 46 | ] 47 | ) 48 | def test_build_within_limits(raising_history_builder, sample_messages, 49 | token_limit, expected_token_count, context, prompt): 50 | messages = raising_history_builder.build(sample_messages, token_limit, 51 | system_prompt=prompt, context=context) 52 | assert Tokenizer().messages_token_count(messages) == expected_token_count 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "token_limit, context, prompt", 57 | [ 58 | (32, None, None), 59 | (33, SAMPLE_CONTEXT, None), 60 | (33, None, SYSTEM_PROMPT), 61 | (31, SAMPLE_CONTEXT, SYSTEM_PROMPT), 62 | ], 63 | ids=[ 64 | "exceed_limit_no_context_no_prompt", 65 | "exceed_limit_with_context", 66 | "exceed_limit_with_prompt", 67 | "exceed_limit_with_context_and_prompt", 68 | ] 69 | ) 70 | def test_build_exceeds_limits(raising_history_builder, sample_messages, 71 | token_limit, context, prompt): 72 | with pytest.raises(ValueError) as e: 73 | raising_history_builder.build(sample_messages, token_limit, 74 | system_prompt=prompt, context=context) 75 | err_msg = e.value.args[0] 76 | assert f"require {Tokenizer().messages_token_count(sample_messages)} " \ 77 | f"tokens" in err_msg 78 | assert f"of {token_limit} tokens left for history" in err_msg 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_abuild_not_implemented(raising_history_builder, sample_messages): 83 | with pytest.raises(NotImplementedError): 84 | await raising_history_builder.abuild(sample_messages, 25) 85 | -------------------------------------------------------------------------------- /tests/unit/history_pruner/test_recent_history_pruner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.history_pruner import RecentHistoryPruner 4 | from canopy.models.data_models import UserMessage, \ 5 | AssistantMessage, Context, StringContextContent 6 | from canopy.tokenizer import Tokenizer 7 | 8 | 9 | SAMPLE_CONTEXT = Context(content=StringContextContent( 10 | "Some context information" 11 | ), num_tokens=3) 12 | SYSTEM_PROMPT = "This is a system prompt." 13 | 14 | 15 | @pytest.fixture 16 | def recent_history_builder(): 17 | return RecentHistoryPruner(min_history_messages=1) 18 | 19 | 20 | @pytest.fixture 21 | def sample_messages(): 22 | return [ 23 | UserMessage(content="Hello there!"), 24 | AssistantMessage(content="Hi! 
How can I help you?"), 25 | UserMessage(content="Tell me about the weather."), 26 | AssistantMessage(content="Anything else?"), 27 | UserMessage(content="No that's enough"), 28 | ] 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "token_limit, expected_tail, expected_token_count, context, prompt", 33 | [ 34 | (50, 5, 33, None, None), 35 | (18, 2, 11, None, None), 36 | (10, 1, 6, None, None), 37 | (50, 5, 33, SAMPLE_CONTEXT, None), 38 | (50, 5, 33, None, SYSTEM_PROMPT), 39 | (50, 5, 33, SAMPLE_CONTEXT, SYSTEM_PROMPT), 40 | (11, 1, 6, SAMPLE_CONTEXT, None), 41 | (18, 1, 6, None, SYSTEM_PROMPT), 42 | (19, 1, 6, SAMPLE_CONTEXT, SYSTEM_PROMPT), 43 | ], 44 | ids=[ 45 | "full_history_fit_no_context_no_prompt", 46 | "truncated_no_context_no_prompt", 47 | "single_message_no_context_no_prompt", 48 | "full_history_fit_with_context", 49 | "full_history_fit_with_prompt", 50 | "full_history_fit_with_context_and_prompt", 51 | "truncated_with_context", 52 | "truncated_with_prompt", 53 | "truncated_with_context_and_prompt", 54 | ] 55 | ) 56 | def test_build(recent_history_builder, 57 | sample_messages, 58 | token_limit, 59 | expected_tail, 60 | expected_token_count, 61 | context, 62 | prompt): 63 | messages = recent_history_builder.build(sample_messages, 64 | token_limit, 65 | system_prompt=prompt, 66 | context=context) 67 | assert messages == sample_messages[-expected_tail:] 68 | assert Tokenizer().messages_token_count(messages) == expected_token_count 69 | 70 | 71 | def test_min_history_messages(sample_messages): 72 | recent_history_builder = RecentHistoryPruner( 73 | min_history_messages=2 74 | ) 75 | token_limit = 18 76 | messages = recent_history_builder.build(sample_messages, token_limit) 77 | assert messages == sample_messages[-2:] 78 | assert Tokenizer().messages_token_count(messages) == 11 79 | 80 | token_limit = 10 81 | with pytest.raises(ValueError) as e: 82 | recent_history_builder.build(sample_messages, token_limit) 83 | err_msg = e.value.args[0] 84 | assert f"The {2} most recent" in err_msg 85 | assert f"calculated history of {token_limit}" in err_msg 86 | assert "history require 11 tokens" in err_msg 87 | 88 | 89 | def test_build_with_empty_history(recent_history_builder): 90 | messages = recent_history_builder.build([], 15) 91 | assert messages == [] 92 | assert Tokenizer().messages_token_count(messages) == 0 93 | 94 | 95 | @pytest.mark.asyncio 96 | async def test_abuild_not_implemented(recent_history_builder, sample_messages): 97 | with pytest.raises(NotImplementedError): 98 | await recent_history_builder.abuild(sample_messages, 25) 99 | -------------------------------------------------------------------------------- /tests/unit/query_generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/query_generators/__init__.py -------------------------------------------------------------------------------- /tests/unit/query_generators/test_instruction_query_generator.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from unittest.mock import create_autospec 3 | 4 | import pytest 5 | 6 | from canopy.chat_engine.query_generator import InstructionQueryGenerator 7 | from canopy.llm import BaseLLM 8 | from canopy.models.api_models import ChatResponse, _Choice, TokenCounts 9 | from canopy.models.data_models import Query, UserMessage, AssistantMessage 10 | 11 | 12 | @pytest.fixture 13 | def 
mock_llm(): 14 | return create_autospec(BaseLLM) 15 | 16 | 17 | @pytest.fixture 18 | def query_generator(mock_llm): 19 | query_gen = InstructionQueryGenerator( 20 | llm=mock_llm, 21 | ) 22 | return query_gen 23 | 24 | 25 | @pytest.fixture 26 | def sample_messages(): 27 | return [UserMessage(content="How can I init a client?"), 28 | AssistantMessage(content="Which kind of client?"), 29 | UserMessage(content="A pinecone client.")] 30 | 31 | 32 | @pytest.mark.parametrize(("response", "query", "call_count"), [ 33 | ( 34 | '{"question": "How do I init a pinecone client?"}', 35 | "How do I init a pinecone client?", 36 | 1 37 | ), 38 | 39 | ( 40 | 'Unparseable JSON response from LLM, falling back to the last message', 41 | "A pinecone client.", 42 | 3 43 | ) 44 | 45 | ]) 46 | def test_generate(query_generator, 47 | mock_llm, 48 | sample_messages, 49 | response, 50 | query, 51 | call_count): 52 | mock_llm.chat_completion.return_value = ChatResponse( 53 | id="meta-llama/Llama-2-7b-chat-hf-HTQ-4", 54 | object="text_completion", 55 | created=1702569324, 56 | model='meta-llama/Llama-2-7b-chat-hf', 57 | usage=TokenCounts( 58 | prompt_tokens=367, 59 | completion_tokens=19, 60 | total_tokens=386 61 | ), 62 | choices=[ 63 | _Choice( 64 | index=0, 65 | message=AssistantMessage( 66 | content=response 67 | ) 68 | ) 69 | ] 70 | ) 71 | 72 | result = query_generator.generate(messages=sample_messages, 73 | max_prompt_tokens=4096) 74 | 75 | assert mock_llm.chat_completion.call_count == call_count 76 | assert isinstance(result, List) 77 | assert len(result) == 1 78 | assert result[0] == Query(text=query) 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_agenerate_not_implemented(query_generator, 83 | mock_llm, 84 | sample_messages 85 | ): 86 | with pytest.raises(NotImplementedError): 87 | await query_generator.agenerate(messages=sample_messages, 88 | max_prompt_tokens=100) 89 | -------------------------------------------------------------------------------- /tests/unit/query_generators/test_last_message_query_generator.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.chat_engine.query_generator import LastMessageQueryGenerator 4 | from canopy.models.data_models import UserMessage, Query, AssistantMessage 5 | 6 | 7 | @pytest.fixture 8 | def sample_messages(): 9 | return [ 10 | UserMessage(content="What is photosynthesis?") 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def query_generator(): 16 | return LastMessageQueryGenerator() 17 | 18 | 19 | def test_generate(query_generator, sample_messages): 20 | expected = [Query(text=sample_messages[-1].content)] 21 | actual = query_generator.generate(sample_messages, 0) 22 | assert actual == expected 23 | 24 | 25 | @pytest.mark.asyncio 26 | async def test_agenerate(query_generator, sample_messages): 27 | expected = [Query(text=sample_messages[-1].content)] 28 | actual = await query_generator.agenerate(sample_messages, 0) 29 | assert actual == expected 30 | 31 | 32 | def test_generate_fails_with_empty_history(query_generator): 33 | with pytest.raises(ValueError): 34 | query_generator.generate([], 0) 35 | 36 | 37 | def test_generate_fails_with_no_user_message(query_generator): 38 | with pytest.raises(ValueError): 39 | query_generator.generate([ 40 | AssistantMessage(content="Hi! 
How can I help you?") 41 | ], 0) 42 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/record_encoder/__init__.py -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_dense_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.record_encoder import DenseRecordEncoder 4 | from .base_test_record_encoder import BaseTestRecordEncoder 5 | from ..stubs.stub_dense_encoder import StubDenseEncoder 6 | 7 | 8 | class TestStubRecordEncoder(BaseTestRecordEncoder): 9 | 10 | @staticmethod 11 | @pytest.fixture 12 | def expected_dimension(): 13 | return 3 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def inner_encoder(expected_dimension): 18 | return StubDenseEncoder(dimension=3) 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def record_encoder(inner_encoder): 23 | return DenseRecordEncoder(inner_encoder, batch_size=2) 24 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_jina_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.jina import JinaRecordEncoder 5 | from canopy.models.data_models import Query 6 | 7 | from unittest.mock import patch 8 | 9 | documents = [KBDocChunk( 10 | id=f"doc_1_{i}", 11 | text=f"Sample document {i}", 12 | document_id=f"doc_{i}", 13 | metadata={"test": i}, 14 | source="doc_1", 15 | ) 16 | for i in range(4) 17 | ] 18 | 19 | queries = [Query(text="Sample query 1"), 20 | Query(text="Sample query 2"), 21 | Query(text="Sample query 3"), 22 | Query(text="Sample query 4")] 23 | 24 | 25 | @pytest.fixture 26 | def encoder(): 27 | return JinaRecordEncoder(api_key='test_api_key', batch_size=2) 28 | 29 | 30 | def test_dimension(encoder): 31 | with patch('pinecone_text.dense.JinaEncoder.encode_documents') \ 32 | as mock_encode_documents: 33 | mock_encode_documents.return_value = [0.1, 0.2, 0.3] 34 | assert encoder.dimension == 3 35 | 36 | 37 | def custom_encode(*args, **kwargs): 38 | input_to_encode = args[0] 39 | if isinstance(input_to_encode, list): 40 | return [[0.1, 0.2, 0.3] for _ in input_to_encode] 41 | else: 42 | return [0.1, 0.2, 0.3] 43 | 44 | 45 | @pytest.mark.parametrize("items,function", 46 | [(documents, "encode_documents"), 47 | (queries, "encode_queries"), 48 | ([], "encode_documents"), 49 | ([], "encode_queries")]) 50 | def test_encode_documents(encoder, items, function): 51 | with patch('pinecone_text.dense.JinaEncoder.encode_documents', 52 | side_effect=custom_encode): 53 | with patch('pinecone_text.dense.JinaEncoder.encode_queries', 54 | side_effect=custom_encode): 55 | encoded_documents = getattr(encoder, function)(items) 56 | 57 | assert len(encoded_documents) == len(items) 58 | assert all(len(encoded.values) == encoder.dimension 59 | for encoded in encoded_documents) 60 | 61 | 62 | @pytest.mark.asyncio 63 | @pytest.mark.parametrize("items,function", 64 | [("aencode_documents", documents), 65 | ("aencode_queries", queries)]) 66 | async def test_aencode_not_implemented(encoder, function, items): 67 | with 
pytest.raises(NotImplementedError): 68 | await encoder.aencode_queries(items) 69 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_sentence_transformers_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.knowledge_base.record_encoder.sentence_transformers import ( 5 | SentenceTransformerRecordEncoder 6 | ) 7 | from canopy.models.data_models import Query 8 | 9 | from unittest.mock import patch 10 | 11 | documents = [KBDocChunk( 12 | id=f"doc_1_{i}", 13 | text=f"Sample document {i}", 14 | document_id=f"doc_{i}", 15 | metadata={"test": i}, 16 | source="doc_1", 17 | ) 18 | for i in range(4) 19 | ] 20 | 21 | queries = [Query(text="Sample query 1"), 22 | Query(text="Sample query 2"), 23 | Query(text="Sample query 3"), 24 | Query(text="Sample query 4")] 25 | 26 | 27 | @pytest.fixture 28 | def encoder(): 29 | try: 30 | encoder = SentenceTransformerRecordEncoder(batch_size=2) 31 | except ImportError: 32 | pytest.skip( 33 | "`transformers` extra not installed. Skipping SentenceTransformer unit " 34 | "tests" 35 | ) 36 | return encoder 37 | 38 | 39 | def test_dimension(encoder): 40 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents') \ 41 | as mock_encode_documents: 42 | mock_encode_documents.return_value = [0.1, 0.2, 0.3] 43 | assert encoder.dimension == 3 44 | 45 | 46 | def custom_encode(*args, **kwargs): 47 | input_to_encode = args[0] 48 | if isinstance(input_to_encode, list): 49 | return [[0.1, 0.2, 0.3] for _ in input_to_encode] 50 | else: 51 | return [0.1, 0.2, 0.3] 52 | 53 | 54 | @pytest.mark.parametrize("items,function", 55 | [(documents, "encode_documents"), 56 | (queries, "encode_queries"), 57 | ([], "encode_documents"), 58 | ([], "encode_queries")]) 59 | def test_encode_documents(encoder, items, function): 60 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_documents', 61 | side_effect=custom_encode): 62 | with patch('pinecone_text.dense.SentenceTransformerEncoder.encode_queries', 63 | side_effect=custom_encode): 64 | encoded_documents = getattr(encoder, function)(items) 65 | 66 | assert len(encoded_documents) == len(items) 67 | assert all(len(encoded.values) == encoder.dimension 68 | for encoded in encoded_documents) 69 | 70 | 71 | @pytest.mark.asyncio 72 | @pytest.mark.parametrize("items,function", 73 | [("aencode_documents", documents), 74 | ("aencode_queries", queries)]) 75 | async def test_aencode_not_implemented(encoder, function, items): 76 | with pytest.raises(NotImplementedError): 77 | await encoder.aencode_queries(items) 78 | -------------------------------------------------------------------------------- /tests/unit/record_encoder/test_stub_record_encoder.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from .base_test_record_encoder import BaseTestRecordEncoder 3 | from ..stubs.stub_dense_encoder import StubDenseEncoder 4 | from ..stubs.stub_record_encoder import StubRecordEncoder 5 | 6 | 7 | class TestStubRecordEncoder(BaseTestRecordEncoder): 8 | 9 | @staticmethod 10 | @pytest.fixture 11 | def expected_dimension(): 12 | return 3 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def inner_encoder(expected_dimension): 17 | return StubDenseEncoder(dimension=3) 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | def record_encoder(inner_encoder): 22 | return 
StubRecordEncoder(inner_encoder, 23 |                                  batch_size=2) 24 | -------------------------------------------------------------------------------- /tests/unit/stubs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/stubs/__init__.py -------------------------------------------------------------------------------- /tests/unit/stubs/stub_chunker.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from canopy.knowledge_base.chunker.base import Chunker 3 | from canopy.knowledge_base.models import KBDocChunk 4 | from canopy.models.data_models import Document 5 | 6 | 7 | class StubChunker(Chunker): 8 | 9 |     def __init__(self, num_chunks_per_doc: int = 1): 10 |         super().__init__() 11 |         self.num_chunks_per_doc = num_chunks_per_doc 12 | 13 |     def chunk_single_document(self, document: Document) -> List[KBDocChunk]: 14 |         if document.text == "": 15 |             return [] 16 | 17 |         # simply duplicate docs as chunks 18 |         return [KBDocChunk(id=self.generate_chunk_id(document.id, i), 19 |                            document_id=document.id, 20 |                            text=document.text + (f" dup_{i}" if i > 0 else ""), 21 |                            source=document.source, 22 |                            metadata=document.metadata) 23 |                 for i in range(self.num_chunks_per_doc)] 24 | 25 |     async def achunk_single_document(self, document: Document) -> List[KBDocChunk]: 26 |         raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_dense_encoder.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | import numpy as np 3 | from collections import defaultdict 4 | from typing import Union, List 5 | 6 | from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder 7 | 8 | 9 | class StubDenseEncoder(BaseDenseEncoder): 10 |     """ 11 |     Bag-of-words encoder that hashes each word and projects the resulting 12 |     sparse word-count vector to a dense vector using a per-word seeded random 13 |     projection, in the spirit of the Johnson–Lindenstrauss lemma. 
14 | """ 15 | 16 | def __init__(self, 17 | dimension: int = 8, 18 | vocab_size: int = 2 ** 12): 19 | self.input_dim = vocab_size 20 | self.dimension = dimension 21 | 22 | def _text_to_word_counts(self, text: str) -> defaultdict: 23 | words = text.split() 24 | word_counts = defaultdict(int) 25 | for word in words: 26 | hashed_word = mmh3.hash(word) % self.input_dim 27 | word_counts[hashed_word] += 1 28 | return word_counts 29 | 30 | def _encode_text(self, text: str) -> List[float]: 31 | word_counts = self._text_to_word_counts(text) 32 | 33 | # This will hold the result of word_counts * random_matrix 34 | projected_embedding = np.zeros(self.dimension, dtype=np.float32) 35 | 36 | for hashed_word, count in word_counts.items(): 37 | rng = np.random.default_rng(hashed_word) 38 | # Seed the RNG with the hashed word index for consistency 39 | random_vector = rng.standard_normal(self.dimension) 40 | projected_embedding += count * random_vector 41 | 42 | projected_embedding = projected_embedding.astype(np.float32) 43 | return list(projected_embedding / np.linalg.norm(projected_embedding)) 44 | 45 | def encode_documents(self, 46 | texts: Union[str, List[str]] 47 | ) -> Union[List[float], List[List[float]]]: 48 | return self._encode(texts) 49 | 50 | def encode_queries(self, 51 | texts: Union[str, List[str]] 52 | ) -> Union[List[float], List[List[float]]]: 53 | return self._encode(texts) 54 | 55 | def _encode(self, 56 | texts: Union[str, List[str]] 57 | ) -> Union[List[float], List[List[float]]]: 58 | if isinstance(texts, str): 59 | return self._encode_text(texts) 60 | else: 61 | return [self._encode_text(text) for text in texts] 62 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_record_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from canopy.knowledge_base.record_encoder import RecordEncoder 4 | from canopy.knowledge_base.models import KBQuery, KBDocChunk, KBEncodedDocChunk 5 | from canopy.models.data_models import Query 6 | from .stub_dense_encoder import StubDenseEncoder 7 | 8 | 9 | class StubRecordEncoder(RecordEncoder): 10 | 11 | def __init__(self, 12 | stub_dense_encoder: StubDenseEncoder, 13 | batch_size: int = 1): 14 | super().__init__(batch_size) 15 | self._dense_encoder = stub_dense_encoder 16 | 17 | def _encode_documents_batch(self, 18 | documents: List[KBDocChunk] 19 | ) -> List[KBEncodedDocChunk]: 20 | result: List[KBEncodedDocChunk] = [] 21 | for doc in documents: 22 | values = self._dense_encoder.encode_documents(doc.text) 23 | result.append( 24 | KBEncodedDocChunk( 25 | **doc.model_dump(), 26 | values=values)) 27 | return result 28 | 29 | def _encode_queries_batch(self, 30 | queries: List[Query] 31 | ) -> List[KBQuery]: 32 | result: List[KBQuery] = [] 33 | for query in queries: 34 | values = self._dense_encoder.encode_queries(query.text) 35 | result.append( 36 | KBQuery(**query.model_dump(), 37 | values=values)) 38 | return result 39 | 40 | async def _aencode_documents_batch(self, 41 | documents: List[KBDocChunk] 42 | ) -> List[KBEncodedDocChunk]: 43 | raise NotImplementedError() 44 | 45 | async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]: 46 | raise NotImplementedError() 47 | 48 | @property 49 | def dimension(self) -> int: 50 | return self._dense_encoder.dimension 51 | -------------------------------------------------------------------------------- /tests/unit/stubs/stub_tokenizer.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | from canopy.tokenizer.base import BaseTokenizer 3 | from canopy.models.data_models import Messages 4 | 5 | 6 | class StubTokenizer(BaseTokenizer): 7 | 8 | def __init__(self, message_overhead: int = 3): 9 | self._message_overhead = message_overhead 10 | 11 | def tokenize(self, text: str) -> List[str]: 12 | return text.split() 13 | 14 | def detokenize(self, tokens: List[str]) -> str: 15 | if not isinstance(tokens, List): 16 | raise TypeError(f"detokenize expect List[str], got f{type(tokens)}") 17 | return " ".join(tokens) 18 | 19 | def messages_token_count(self, messages: Messages) -> int: 20 | return sum(len(self.tokenize(msg.content)) + self._message_overhead 21 | for msg in messages) 22 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/tokenizer/__init__.py -------------------------------------------------------------------------------- /tests/unit/tokenizer/base_test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pytest 3 | 4 | 5 | class BaseTestTokenizer(ABC): 6 | 7 | @staticmethod 8 | @pytest.fixture(scope="class") 9 | @abstractmethod 10 | def tokenizer(): 11 | pass 12 | 13 | @staticmethod 14 | @pytest.fixture 15 | def text(): 16 | return "string with special characters like !@#$%^&*()_+ 日本 " \ 17 | "spaces \n \n\n CASE cAse " 18 | 19 | @staticmethod 20 | @pytest.fixture 21 | @abstractmethod 22 | def expected_tokens(text): 23 | pass 24 | 25 | # region: test tokenize 26 | 27 | @staticmethod 28 | def test_tokenize(tokenizer, text, expected_tokens): 29 | tokens = tokenizer.tokenize(text) 30 | assert tokens == expected_tokens, f"\nExpected: {expected_tokens}" \ 31 | f"\nActual: {tokens}" 32 | 33 | @staticmethod 34 | def test_tokenize_empty_string(tokenizer): 35 | assert tokenizer.tokenize("") == [] 36 | 37 | @staticmethod 38 | def test_tokenize_invalid_input_type_raise_exception(tokenizer): 39 | with pytest.raises(Exception): 40 | tokenizer.tokenize(1) 41 | 42 | with pytest.raises(Exception): 43 | tokenizer.tokenize(["asd"]) 44 | 45 | # endregion 46 | 47 | # region: test detokenize 48 | 49 | @staticmethod 50 | def test_detokenize(tokenizer, text, expected_tokens): 51 | text = tokenizer.detokenize(expected_tokens) 52 | assert text == text 53 | 54 | @staticmethod 55 | def test_detokenize_empty_string(tokenizer): 56 | assert tokenizer.detokenize([]) == "" 57 | 58 | @staticmethod 59 | def test_detokenize_invalid_input_type_raise_exception(tokenizer): 60 | with pytest.raises(Exception): 61 | tokenizer.detokenize(1) 62 | 63 | with pytest.raises(Exception): 64 | tokenizer.detokenize("asd") 65 | 66 | # endregion 67 | 68 | # region test token_count 69 | 70 | @staticmethod 71 | def test_token_count(tokenizer, text, expected_tokens): 72 | token_count = tokenizer.token_count(text) 73 | assert token_count == len(expected_tokens) 74 | assert token_count == len(tokenizer.tokenize(text)) 75 | 76 | @staticmethod 77 | def test_token_count_empty_string(tokenizer): 78 | assert tokenizer.token_count("") == 0 79 | 80 | # endregion 81 | 82 | @staticmethod 83 | def test_tokenize_detokenize_compatibility(tokenizer, text, expected_tokens): 84 | retext = 
tokenizer.detokenize(tokenizer.tokenize(text)) 85 | assert retext == text, f"\nExpected: {text}\nActual: {retext}" 86 | reconstructed_expected_tokens = tokenizer.tokenize( 87 | tokenizer.detokenize(expected_tokens)) 88 | assert reconstructed_expected_tokens == expected_tokens, \ 89 | f"\nExpected: {expected_tokens}\nActual: {reconstructed_expected_tokens}" 90 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_cohere_hf_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import CohereHFTokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestCohereHFTokenizer(BaseTestTokenizer): 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | def tokenizer(): 11 | try: 12 | tokenizer = CohereHFTokenizer() 13 | except ImportError: 14 | pytest.skip( 15 | "`cohere` extra not installed. Skipping CohereHFTokenizer unit " 16 | "tests" 17 | ) 18 | return tokenizer 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def expected_tokens(text): 23 | return ['string', 24 | 'Ġwith', 25 | 'Ġspecial', 26 | 'Ġcharacters', 27 | 'Ġlike', 28 | 'Ġ!', 29 | '@', 30 | '#$', 31 | '%^', 32 | '&', 33 | '*', 34 | '()', 35 | '_', 36 | '+', 37 | 'ĠæĹ¥æľ¬', 38 | 'Ġspaces', 39 | 'ĠĠĠ', 40 | 'ĊĠĊĊ', 41 | 'ĠCASE', 42 | 'Ġc', 43 | 'A', 44 | 'se', 45 | 'Ġ'] 46 | 47 | @staticmethod 48 | def test_messages_token_count(tokenizer): 49 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 50 | assert tokenizer.messages_token_count(messages) == 11 51 | 52 | messages = [ 53 | MessageBase(role=Role.USER, content="Hello, assistant."), 54 | MessageBase( 55 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?" 56 | ), 57 | ] 58 | assert tokenizer.messages_token_count(messages) == 25 59 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import LlamaTokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestLlamaTokenizer(BaseTestTokenizer): 8 | @staticmethod 9 | @pytest.fixture(scope="class") 10 | def tokenizer(): 11 | try: 12 | tokenizer = LlamaTokenizer(model_name="hf-internal-testing/llama-tokenizer") 13 | except ImportError: 14 | pytest.skip( 15 | "`transformers` extra not installed. Skipping LLamaTokenizer unit " 16 | "tests" 17 | ) 18 | return tokenizer 19 | 20 | @staticmethod 21 | @pytest.fixture 22 | def expected_tokens(text): 23 | return [ 24 | "▁string", 25 | "▁with", 26 | "▁special", 27 | "▁characters", 28 | "▁like", 29 | "▁!", 30 | "@", 31 | "#", 32 | "$", 33 | "%", 34 | "^", 35 | "&", 36 | "*", 37 | "()", 38 | "_+", 39 | "▁", 40 | "日", 41 | "本", 42 | "▁spaces", 43 | "▁▁▁", 44 | "<0x0A>", 45 | "▁", 46 | "<0x0A>", 47 | "<0x0A>", 48 | "▁CASE", 49 | "▁c", 50 | "A", 51 | "se", 52 | "▁", 53 | ] 54 | 55 | @staticmethod 56 | def test_messages_token_count(tokenizer): 57 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 58 | assert tokenizer.messages_token_count(messages) == 11 59 | 60 | messages = [ 61 | MessageBase(role=Role.USER, content="Hello, assistant."), 62 | MessageBase( 63 | role=Role.ASSISTANT, content="Hello, user. How can I assist you?" 
64 | ), 65 | ] 66 | assert tokenizer.messages_token_count(messages) == 25 67 | 68 | @staticmethod 69 | def test_messages_token_count_empty_messages(tokenizer): 70 | assert tokenizer.messages_token_count([]) == 3 71 | 72 | @staticmethod 73 | def test_special_tokens_to_natural_text(tokenizer): 74 | input_text = "_<0x0A>__ word" 75 | tokens = tokenizer.tokenize(input_text) 76 | expected_tokens = [ 77 | "▁", 78 | "_", 79 | "<", 80 | "0", 81 | "x", 82 | "0", 83 | "A", 84 | ">", 85 | "__", 86 | "▁", 87 | "<", 88 | "s", 89 | ">", 90 | "word", 91 | ] 92 | assert tokens == expected_tokens 93 | 94 | # TODO: this currently fails since detokenize() adds a space after and . 95 | # We need to decide if this is the desired behavior or not. 96 | assert tokenizer.detokenize(tokens) == input_text 97 | 98 | assert tokenizer.token_count(input_text) == len(tokens) 99 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_openai_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import OpenAITokenizer 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | 6 | 7 | class TestOpenAITokenizer(BaseTestTokenizer): 8 | 9 | @staticmethod 10 | @pytest.fixture(scope="class") 11 | def tokenizer(): 12 | return OpenAITokenizer(model_name="gpt-3.5-turbo") 13 | 14 | @staticmethod 15 | @pytest.fixture 16 | def expected_tokens(text): 17 | return ['string', ' with', ' special', ' characters', ' like', 18 | ' !', '@', '#$', '%^', '&', '*', '()', '_', '+', ' 日', 19 | '本', ' spaces', ' \n', ' \n\n', ' CASE', ' c', 'A', 20 | 'se', " "] 21 | 22 | @staticmethod 23 | def test_messages_token_count(tokenizer): 24 | messages = [MessageBase(role=Role.USER, content="Hello, assistant.")] 25 | assert tokenizer.messages_token_count(messages) == 11 26 | 27 | messages = [MessageBase(role=Role.USER, 28 | content="Hello, assistant."), 29 | MessageBase(role=Role.ASSISTANT, 30 | content="Hello, user. 
How can I assist you?")] 31 | assert tokenizer.messages_token_count(messages) == 25 32 | 33 | @staticmethod 34 | def test_messages_token_count_empty_messages(tokenizer): 35 | assert tokenizer.messages_token_count([]) == 3 36 | 37 | @staticmethod 38 | def test_special_tokens_to_natural_text(tokenizer): 39 | tokens = tokenizer.tokenize("<|endoftext|>") 40 | assert tokens == ['<', '|', 'endo', 'ft', 'ext', '|', '>'] 41 | 42 | assert tokenizer.detokenize(tokens) == "<|endoftext|>" 43 | 44 | assert tokenizer.token_count("<|endoftext|>") == 7 45 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_stub_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from canopy.models.data_models import MessageBase, Role 4 | from .base_test_tokenizer import BaseTestTokenizer 5 | from ..stubs.stub_tokenizer import StubTokenizer 6 | 7 | 8 | class TestStubTokenizer(BaseTestTokenizer): 9 | 10 | @staticmethod 11 | @pytest.fixture(scope="class") 12 | def tokenizer(): 13 | return StubTokenizer() 14 | 15 | @staticmethod 16 | @pytest.fixture 17 | def expected_tokens(text): 18 | return text.split() 19 | 20 | @staticmethod 21 | def test_tokenize_detokenize_compatibility(tokenizer, text, expected_tokens): 22 | assert tokenizer.detokenize(tokenizer.tokenize(text)) \ 23 | == " ".join(text.split()) 24 | assert tokenizer.tokenize(tokenizer.detokenize(expected_tokens))\ 25 | == expected_tokens 26 | 27 | @staticmethod 28 | def test_messages_token_count(tokenizer): 29 | messages = [MessageBase(role=Role.USER, content="hi bye"), 30 | MessageBase(role=Role.ASSISTANT, content="hi")] 31 | assert tokenizer.messages_token_count(messages) == 3 + len(messages) * 3 32 | 33 | @staticmethod 34 | def test_messages_token_count_empty_messages(tokenizer): 35 | assert tokenizer.messages_token_count([]) == 0 36 | -------------------------------------------------------------------------------- /tests/unit/tokenizer/test_tokenizer_singleton.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from canopy.tokenizer import Tokenizer 3 | from ..stubs.stub_tokenizer import StubTokenizer 4 | 5 | 6 | class StubChildTokenizer(StubTokenizer): 7 | pass 8 | 9 | 10 | @pytest.fixture 11 | def reset_tokenizer_singleton(): 12 | before = Tokenizer._tokenizer_instance.__class__ 13 | Tokenizer.clear() 14 | yield 15 | Tokenizer.clear() 16 | Tokenizer.initialize(tokenizer_class=before) 17 | 18 | 19 | def test_tokenizer_init(reset_tokenizer_singleton): 20 | Tokenizer.initialize(StubTokenizer) 21 | assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer) 22 | assert Tokenizer._initialized is True 23 | 24 | 25 | def test_tokenizer_init_already_initialized_same_class(reset_tokenizer_singleton): 26 | Tokenizer.initialize(StubTokenizer, message_overhead=10) 27 | tokenizer = Tokenizer() 28 | assert isinstance(Tokenizer._tokenizer_instance, StubTokenizer) 29 | assert Tokenizer._initialized is True 30 | assert Tokenizer._tokenizer_instance._message_overhead == 10 31 | assert tokenizer._tokenizer_instance._message_overhead == 10 32 | 33 | 34 | def test_tokenizer_init_already_initialized_different_class(reset_tokenizer_singleton): 35 | Tokenizer.initialize(StubChildTokenizer, message_overhead=10) 36 | tokenizer = Tokenizer() 37 | assert isinstance(Tokenizer._tokenizer_instance, StubChildTokenizer) 38 | assert Tokenizer._initialized is True 39 | assert 
isinstance(tokenizer._tokenizer_instance, StubChildTokenizer) 40 | 41 | 42 | def test_tokenizer_init_invalid_same_class(reset_tokenizer_singleton): 43 | with pytest.raises(ValueError): 44 | Tokenizer.initialize(Tokenizer) 45 | 46 | 47 | def test_tokenizer_init_invalid_tokenizer_class(reset_tokenizer_singleton): 48 | class InvalidTokenizer: 49 | pass 50 | with pytest.raises(ValueError): 51 | Tokenizer.initialize(InvalidTokenizer) 52 | 53 | 54 | def test_tokenizer_uniqueness(reset_tokenizer_singleton): 55 | Tokenizer.initialize(StubTokenizer) 56 | tokenizer = Tokenizer() 57 | assert tokenizer is Tokenizer() 58 | another_tokenizer = Tokenizer() 59 | assert tokenizer is another_tokenizer 60 | -------------------------------------------------------------------------------- /tests/unit/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinecone-io/canopy/fabb7d7c8bafffd9c7d0890c1624bec75e5bef77/tests/unit/utils/__init__.py -------------------------------------------------------------------------------- /tests/unit/utils/_stub_classes.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Optional 3 | 4 | from canopy.utils.config import ConfigurableMixin 5 | 6 | 7 | # A base class that inherits from ConfigurableMixin, with multiple derived classes 8 | class BaseStubChunker(abc.ABC, ConfigurableMixin): 9 | @abc.abstractmethod 10 | def chunk(self, text: str) -> str: 11 | pass 12 | 13 | 14 | class StubChunker(BaseStubChunker): 15 | DEFAULT_CHUNK_SIZE = 100 16 | DEFAULT_SPLITTER = ' ' 17 | 18 | def __init__(self, chunk_size=DEFAULT_CHUNK_SIZE, splitter=DEFAULT_SPLITTER): 19 | self.chunk_size = chunk_size 20 | self.splitter = splitter 21 | 22 | def chunk(self, text: str) -> str: 23 | return text.split(self.splitter) 24 | 25 | 26 | class StubOtherChunker(BaseStubChunker): 27 | DEFAULT_CHUNK_SIZE = 200 28 | 29 | def __init__(self, chunk_size=DEFAULT_CHUNK_SIZE, some_param=' '): 30 | self.chunk_size = chunk_size 31 | self.splitter = some_param 32 | 33 | def chunk(self, text: str) -> str: 34 | return text.split(self.splitter) 35 | 36 | 37 | # A base class that inherits from ConfigurableMixin, where the derived class has 38 | # default components 39 | class BaseStubKB(abc.ABC, ConfigurableMixin): 40 | pass 41 | 42 | 43 | class StubKB(BaseStubKB): 44 | _DEFAULT_COMPONENTS = { 45 | 'chunker': StubChunker, 46 | } 47 | 48 | DEFAULT_TOP_K = 5 49 | 50 | def __init__(self, 51 | chunker: Optional[BaseStubChunker] = None, 52 | top_k: int = DEFAULT_TOP_K, 53 | ): 54 | self.chunker = chunker or self._DEFAULT_COMPONENTS['chunker']() 55 | self.top_k = top_k 56 | 57 | 58 | class BaseStubContextBuilder(ConfigurableMixin): 59 | pass 60 | 61 | 62 | class StubContextBuilder(BaseStubContextBuilder): 63 | DEFAULT_MAX_CONTEXT_LENGTH = 1000 64 | 65 | def __init__(self, max_context_length: int = DEFAULT_MAX_CONTEXT_LENGTH): 66 | self.max_context_length = max_context_length 67 | 68 | 69 | # A base class that inherits from ConfigurableMixin, where the derived class has 70 | # default components, one of them is a class that also inherits from ConfigurableMixin 71 | class BaseStubContextEngine(ConfigurableMixin): 72 | pass 73 | 74 | 75 | class StubContextEngine(BaseStubContextEngine): 76 | _DEFAULT_COMPONENTS = { 77 | 'knowledge_base': StubKB, 78 | 'context_builder': StubContextBuilder, 79 | } 80 | 81 | def __init__(self, 82 | knowledge_base: StubKB, 83 | context_builder: 
Optional[BaseStubContextBuilder] = None, 84 |                  filter: Optional[dict] = None, 85 |                  ): 86 |         self.knowledge_base = knowledge_base or self._DEFAULT_COMPONENTS['knowledge_base']() 87 |         self.context_builder = (context_builder or 88 |                                 self._DEFAULT_COMPONENTS['context_builder']()) 89 |         self.filter = filter 90 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | from typing import List 4 | 5 | from canopy.knowledge_base.knowledge_base import _get_global_client, INDEX_NAME_PREFIX 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def create_index_name(testrun_uid: str, prefix: str) -> str: 11 |     today = datetime.today().strftime("%Y-%m-%d") 12 |     return f"{testrun_uid[-6:]}-{prefix}-{today}" 13 | 14 | 15 | def create_system_tests_index_name(testrun_uid: str) -> str: 16 |     return create_index_name(testrun_uid, "test-kb") 17 | 18 | 19 | def create_e2e_tests_index_name(testrun_uid: str, index_type: str) -> str: 20 |     return create_index_name(testrun_uid, f"test-app-{index_type}") 21 | 22 | 23 | def get_related_indexes(indexes: List[str], testrun_uid: str) -> List[str]: 24 |     return [ 25 |         index for index in indexes 26 |         if index.startswith(f"{INDEX_NAME_PREFIX}{testrun_uid[-6:]}") 27 |     ] 28 | 29 | 30 | def cleanup_indexes(testrun_uid: str): 31 |     client = _get_global_client() 32 |     current_indexes = client.list_indexes().names() 33 |     index_names = get_related_indexes(current_indexes, testrun_uid) 34 |     logger.info(f"Preparing to clean up indexes: {index_names}") 35 |     for index_name in index_names: 36 |         logger.info(f"Deleting index '{index_name}'...") 37 |         client.delete_index(index_name) 38 |         logger.info(f"Index '{index_name}' deleted.") 39 | --------------------------------------------------------------------------------
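Note on /tests/util.py above: these helpers are meant to be driven by the test session itself. The `testrun_uid` argument corresponds to the unique run id that pytest-xdist exposes as a fixture, and `cleanup_indexes` only deletes Pinecone indexes whose names embed that id (via the `INDEX_NAME_PREFIX` plus the last six characters of the run id), so repeated or parallel CI runs do not delete each other's indexes. The repository's real wiring for this presumably lives in tests/conftest.py and scripts/cleanup_indexes.py, which are not shown in this excerpt; the snippet below is only an illustrative sketch, assuming the pytest-xdist `testrun_uid` fixture is available and that tests/util.py is importable as `tests.util`, of how such cleanup could be attached to the end of a session.

# Illustrative sketch only; not a file from this repository.
# Assumes pytest-xdist provides the session-scoped `testrun_uid` fixture
# and that tests/util.py is importable as `tests.util`.
import pytest

from tests.util import cleanup_indexes


@pytest.fixture(scope="session", autouse=True)
def delete_indexes_created_by_this_run(testrun_uid):
    # Let the whole session run first, then remove any index whose name was
    # built from this run's id via create_index_name() above.
    yield
    cleanup_indexes(testrun_uid)

Keying the index names on `testrun_uid` is what makes this kind of teardown safe: each run only ever deletes the indexes it created itself.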