├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── user-story.md ├── actions │ └── integration_tests │ │ └── action.yml ├── dependabot.yml ├── pull_request_template.md ├── utils │ ├── add-category-id.py │ └── pydoc-markdown.sh └── workflows │ ├── api-docs.yaml │ ├── compliance.yml │ ├── continuous-deployment-dev.yml │ ├── continuous-deployment-prod.yml │ ├── continuous-integration.yml │ ├── deploy-prod.yml │ ├── deploy-test.yml │ ├── deploy.yml │ ├── high-prio-bug.yml │ └── merge-queue.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── assets ├── cli.gif └── logo.png ├── deepset_cloud_sdk ├── README.md ├── __about__.py ├── __init__.py ├── _api │ ├── config.py │ ├── deepset_cloud_api.py │ ├── files.py │ └── upload_sessions.py ├── _s3 │ ├── __init__.py │ └── upload.py ├── _service │ └── files_service.py ├── _utils │ ├── __init__.py │ └── datetime.py ├── cli.py ├── models.py └── workflows │ ├── __init__.py │ ├── async_client │ ├── __init__.py │ └── files.py │ ├── pipeline_client │ ├── __init__.py │ ├── models.py │ ├── pipeline_client.py │ └── pipeline_service.py │ └── sync_client │ ├── __init__.py │ ├── files.py │ └── utils.py ├── docs ├── _images │ ├── favicon.svg │ └── white-logo.svg ├── _pydoc │ ├── __init__.py │ ├── config │ │ ├── async_client.yml │ │ ├── cli.yml │ │ └── sync_client.yml │ ├── renderers.py │ └── requirements.txt ├── _stylesheets │ └── extra.css ├── examples │ ├── cli │ │ └── README.md │ ├── data │ │ ├── example.pdf │ │ ├── example.txt │ │ └── example.txt.meta.json │ └── sdk │ │ ├── README.md │ │ └── upload.py ├── index.md └── upload_files.md ├── mkdocs.yml ├── pyproject.toml ├── test-upload ├── example.txt ├── example.txt.meta.json ├── example2.txt └── example2.txt.meta.json └── tests ├── __init__.py ├── conftest.py ├── data ├── .fake-env ├── direct_upload │ ├── example.txt │ └── example.txt.meta.json ├── example.txt ├── upload_folder │ ├── example.csv │ ├── example.csv.meta.json │ ├── example.docx │ ├── example.html │ ├── example.jpg │ ├── example.json │ ├── example.md │ ├── example.pdf │ ├── example.pptx │ ├── example.txt │ ├── example.txt.meta.json │ ├── example.xlsx │ └── example.xml ├── upload_folder_nested │ ├── example.txt │ ├── meta │ │ └── example.txt.meta.json │ └── nested_folder │ │ └── second.txt └── upload_folder_with_duplicates │ ├── file1.txt │ ├── file2.txt │ └── old_files │ ├── file1.txt │ └── file2.txt ├── integration ├── api │ ├── test_integration_files.py │ └── test_integration_upload_sessions.py ├── service │ └── test_integration_files_service.py └── workflows │ └── test_integration_pipeline_client.py ├── test_data ├── basic.txt ├── msmarco.10 │ ├── 103275.txt │ ├── 103275.txt.meta.json │ ├── 103291.txt │ ├── 103291.txt.meta.json │ ├── 110580.txt │ ├── 110580.txt.meta.json │ ├── 117256.txt │ ├── 117256.txt.meta.json │ ├── 16675.txt │ ├── 16675.txt.meta.json │ ├── 22297.txt │ ├── 22297.txt.meta.json │ ├── 35887.txt │ ├── 35887.txt.meta.json │ ├── 61768.txt │ ├── 61768.txt.meta.json │ ├── 79388.txt │ ├── 79388.txt.meta.json │ ├── 87243.txt │ └── 87243.txt.meta.json ├── multiple_file_types │ ├── file00.txt │ ├── file00.txt.meta.json │ ├── file01.xml │ ├── file01.xml.meta.json │ ├── file02.pptx │ ├── file02.pptx.meta.json │ ├── file03.xlsx │ ├── file03.xlsx.meta.json │ ├── file04.json │ ├── file04.json.meta.json │ ├── file05.docx │ ├── file05.docx.meta.json │ ├── file06.md │ ├── file06.md.meta.json │ ├── file07.csv │ ├── file07.csv.meta.json │ ├── file08.pdf │ ├── file08.pdf.meta.json │ ├── 
file09.html │ └── file09.html.meta.json └── multiple_file_types_caps │ ├── File00.txt │ └── File00.txt.meta.json └── unit ├── api ├── test_deepset_cloud_api.py ├── test_files.py └── test_upload_sessions.py ├── s3 └── test_upload.py ├── service └── test_files_service.py ├── test_cli.py ├── utils ├── test_datetime_utils.py └── test_load_configuration.py └── workflows ├── async_client └── test_async_workflow_files.py ├── pipeline_client ├── test_models.py ├── test_pipeline_client.py └── test_pipeline_service.py └── sync_client ├── test_sync_workflow_files.py └── test_utils.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @deepset-ai/grow-squad-backend 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: 'Create a report ' 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | 27 | **Checklist** 28 | - [ ] I added a label for the level of urgency of this bug (see definitions [here](https://www.notion.so/deepsetai/DC-processes-Bugs-and-Issues-79f7250be94b450a934296afd987a29a#4139a178336c439b8f52a99dde5e6b87)) 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/user-story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User Story 3 | about: User Story 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **User Story** 11 | As a ... 12 | I want ... 13 | So that ... 14 | 15 | **Relevant Links** 16 | 17 | **Acceptance Criteria** 18 | - Criteria 1 19 | - Criteria 2 20 | - Criteria 3 21 | 22 | **Task List** 23 | - [ ] #2 24 | - [ ] Task 2 25 | - [ ] Task 3 26 | - ... 
27 | -------------------------------------------------------------------------------- /.github/actions/integration_tests/action.yml: -------------------------------------------------------------------------------- 1 | name: "Integration Tests" 2 | description: "Runs the Integration tests " 3 | 4 | inputs: 5 | API_KEY: 6 | description: "The API_KEY for deepset Cloud" 7 | required: true 8 | API_URL: 9 | description: "The API_URL for deepset Cloud" 10 | required: true 11 | 12 | outputs: {} 13 | runs: 14 | using: "composite" 15 | steps: 16 | - uses: actions/setup-python@v4 17 | with: 18 | python-version: "3.10" 19 | - name: Install Hatch 20 | shell: bash 21 | run: pip install hatch==${{ env.HATCH_VERSION }} 22 | # we are using the "automated-tests" organization with predefined users and workspaces 23 | - name: Run SDK Tests 24 | shell: bash 25 | run: | 26 | API_KEY=${{inputs.API_KEY}} API_URL=${{inputs.API_URL}} hatch run test:integration 27 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # See all config options here https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 2 | version: 2 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "weekly" 8 | commit-message: 9 | prefix: "build: " 10 | groups: 11 | python-production-updates: 12 | dependency-type: "production" 13 | update-types: 14 | - "major" 15 | - "minor" 16 | - "patch" 17 | python-development-updates: 18 | dependency-type: "development" 19 | update-types: 20 | - "major" 21 | - "minor" 22 | - "patch" 23 | - package-ecosystem: "github-actions" 24 | directory: "/" 25 | schedule: 26 | interval: "weekly" 27 | commit-message: 28 | prefix: "ci: " 29 | groups: 30 | # Specify a name for the group, which will be used in pull request titles 31 | # and branch names 32 | ci-updates: 33 | update-types: 34 | - "minor" 35 | - "patch" 36 | - "major" 37 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Related Issues 2 | 3 | - fixes #issue-number 4 | 5 | ### Proposed Changes? 6 | 7 | 8 | 9 | ### How did you test it? 
10 | 11 | 12 | 13 | ### Notes for the reviewer 14 | 15 | 16 | 17 | ### Screenshots (optional) 18 | 19 | 20 | 21 | ### Checklist 22 | 23 | - [ ] I have updated the referenced issue with new insights and changes 24 | - [ ] If this is a code change, I have added unit tests 25 | - [ ] I've used the [conventional commit specification](https://www.conventionalcommits.org/en/v1.0.0/) for my PR title 26 | - [ ] I updated the docstrings 27 | - [ ] If this is a code change, I added meaningful logs and prepared Datadog visualizations and alerts 28 | -------------------------------------------------------------------------------- /.github/utils/add-category-id.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import List 4 | 5 | 6 | def read_file(file_path: str) -> List[str]: 7 | """Reads the content of a markdown file and returns it as a list of lines.""" 8 | with open(file_path, "r", encoding="utf-8") as file: 9 | content = file.readlines() 10 | return content 11 | 12 | 13 | def modify_header(content: List[str], category_id: str) -> List[str]: 14 | """Modifies the YAML front matter in the markdown content to include the category.""" 15 | in_header = False 16 | new_content = [] 17 | category_added = False 18 | end_header_pattern = r"^---$" 19 | start_header_found = False 20 | 21 | for line in content: 22 | if re.match(end_header_pattern, line) and start_header_found: 23 | in_header = False 24 | if not category_added: 25 | new_content.append(f"category: {category_id}\n") 26 | new_content.append(line) 27 | elif in_header: 28 | if line.startswith("category:"): 29 | new_content.append(f"category: {category_id}\n") 30 | category_added = True 31 | else: 32 | new_content.append(line) 33 | else: 34 | if line.strip() == "---": 35 | in_header = True 36 | start_header_found = True 37 | new_content.append(line) 38 | return new_content 39 | 40 | 41 | def update_markdown_files(directory: str, category_id: str) -> None: 42 | """Updates all markdown files in a given directory by modifying their headers.""" 43 | for filename in os.listdir(directory): 44 | if filename.endswith(".md"): 45 | file_path = os.path.join(directory, filename) 46 | content = read_file(file_path) 47 | modified_content = modify_header(content, category_id) 48 | with open(file_path, "w", encoding="utf-8") as file: 49 | file.writelines(modified_content) 50 | 51 | 52 | # Example usage 53 | if __name__ == "__main__": 54 | directory = os.getenv("MARKDOWN_FILES_DIRECTORY", "default_directory") 55 | category_id = os.getenv("CATEGORY_ID", "default_category_id") 56 | update_markdown_files(directory, category_id) 57 | -------------------------------------------------------------------------------- /.github/utils/pydoc-markdown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # Fails on any error in the following loop 4 | export PYTHONPATH=$PWD/docs/_pydoc # Make the renderers available to pydoc 5 | cd docs/_pydoc 6 | rm -rf temp && mkdir temp 7 | cd temp 8 | for file in ../config/* ; do 9 | echo "Converting $file..." 
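  # Each YAML file in config/ tells pydoc-markdown which modules to load and how to render them
  # (see renderers.py). The generated Markdown lands in this temp/ directory, which the api-docs
  # workflow later post-processes (add-category-id.py) and publishes to ReadMe via `rdme`.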
10 | pydoc-markdown "$file" 11 | done 12 | -------------------------------------------------------------------------------- /.github/workflows/api-docs.yaml: -------------------------------------------------------------------------------- 1 | name: API Docs 2 | 3 | on: 4 | release: 5 | types: 6 | - published 7 | 8 | env: 9 | CATEGORY_ID: ${{ secrets.CATEGORY_ID }} 10 | 11 | permissions: 12 | contents: write 13 | 14 | jobs: 15 | deploy: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Setup python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: 3.x 25 | 26 | - name: Setup cache 27 | uses: actions/cache@v4 28 | with: 29 | key: ${{ github.ref }} 30 | path: .cache 31 | 32 | - name: Install dependencies for doc generation 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install -r docs/_pydoc/requirements.txt 36 | pip install --upgrade setuptools # Fix to prevent: ModuleNotFoundError: No module named 'pkg_resources' 37 | 38 | 39 | - name: Generate API docs 40 | run: ./.github/utils/pydoc-markdown.sh 41 | 42 | - name: Configure git to push docs 43 | run: | 44 | git config --global user.name docs-bot 45 | git config --global user.email docs@bot.com 46 | git config pull.rebase false 47 | git pull --allow-unrelated-histories origin gh-pages 48 | 49 | - name: Install dependencies for doc deployment 50 | run: pip install mkdocs-material mkdocstrings[python] mkdocs-mermaid2-plugin mike 51 | 52 | - name: Publish docs to pages 53 | run: | 54 | mike deploy --push --update-aliases ${{github.ref_name}} && \ 55 | mike set-default --push ${{github.ref_name}} 56 | 57 | - name: Add Category ID to all API docs 58 | run: python ./.github/utils/add-category-id.py 59 | env: 60 | MARKDOWN_FILES_DIRECTORY: docs/_pydoc/temp/ 61 | CATEGORY_ID: ${{env.CATEGORY_ID}} 62 | 63 | - name: Run `docs` command 🚀 64 | uses: readmeio/rdme@v10 65 | with: 66 | rdme: docs docs/_pydoc/temp --key=${{ secrets.README_API_KEY }} --version=1.0 67 | -------------------------------------------------------------------------------- /.github/workflows/compliance.yml: -------------------------------------------------------------------------------- 1 | name: Compliance Checks 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'pyproject.toml' 7 | schedule: 8 | - cron: '0 0 * * *' # every day at midnight 9 | 10 | env: 11 | SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} 12 | SLACK_ALERT_CHANNEL: "#dc-alerts" 13 | 14 | jobs: 15 | check-license-compliance: 16 | name: Check license compliance 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c 22 | with: 23 | python-version: "3.10" 24 | 25 | - name: Install prod dependencies 26 | run: | 27 | pip install hatch==v1.14.0 28 | hatch run tools:requirements 29 | pip install -r requirements.txt 30 | 31 | - name: Create file with full dependency list 32 | run: | 33 | pip freeze > requirements-full.txt 34 | 35 | - name: Send license report to Fossa 36 | # This will collect all necessary information (mostly used dependencies) and send it to the Fossa API 37 | uses: fossas/fossa-action@3ebcea1862c6ffbd5cf1b4d0bd6b3fe7bd6f2cac # Use a specific version if locking is preferred 38 | with: 39 | api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }} 40 | 41 | - name: Check license compliance 42 | # This will poll the Fossa API until they have processed the information which we've sent in the previous step 43 | # and fail if 
Fossa found an issue with the licences of our dependencies. 44 | uses: fossas/fossa-action@3ebcea1862c6ffbd5cf1b4d0bd6b3fe7bd6f2cac # Use a specific version if locking is preferred 45 | with: 46 | api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }} 47 | run-tests: true 48 | 49 | - name: Send Slack notification if license check failed 50 | uses: act10ns/slack@44541246747a30eb3102d87f7a4cc5471b0ffb7d 51 | if: failure() && github.ref == 'refs/heads/main' 52 | with: 53 | status: ${{ job.status }} 54 | channel: ${{ env.SLACK_ALERT_CHANNEL }} 55 | -------------------------------------------------------------------------------- /.github/workflows/continuous-deployment-dev.yml: -------------------------------------------------------------------------------- 1 | name: CD - Dev Integration Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | api_url: 7 | required: true 8 | default: https://api.dev.cloud.dpst.dev/api/v1 9 | type: string 10 | description: "The API URL for the test run" 11 | 12 | env: 13 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 14 | 15 | jobs: 16 | tests: 17 | name: Tests 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Run integration tests 22 | uses: ./.github/actions/integration_tests 23 | with: 24 | API_KEY: "${{secrets.API_KEY}}" 25 | API_URL: "${{inputs.api_url}}" 26 | -------------------------------------------------------------------------------- /.github/workflows/continuous-deployment-prod.yml: -------------------------------------------------------------------------------- 1 | name: CD - Prod Integration Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | api_url: 7 | required: false 8 | default: "https://api.cloud.deepset.ai/api/v1" 9 | type: string 10 | description: "The API URL for the test run" 11 | 12 | env: 13 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 14 | 15 | jobs: 16 | tests: 17 | name: Tests 18 | environment: PROD 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Run integration tests 23 | uses: ./.github/actions/integration_tests 24 | with: 25 | API_KEY: "${{secrets.API_KEY}}" 26 | API_URL: ${{ inputs.api_url }} 27 | -------------------------------------------------------------------------------- /.github/workflows/continuous-integration.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | 6 | env: 7 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 8 | 9 | permissions: 10 | pull-requests: write 11 | contents: write 12 | 13 | jobs: 14 | format-black: 15 | name: Format black 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.10" 22 | - name: Install Hatch 23 | run: pip install hatch==${{ env.HATCH_VERSION }} 24 | - name: Run black 25 | run: hatch run code-quality:format 26 | 27 | 28 | mypy: 29 | name: MyPy 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v4 33 | - uses: actions/setup-python@v5 34 | with: 35 | python-version: "3.10" 36 | - name: Install Hatch 37 | run: pip install hatch==${{ env.HATCH_VERSION }} 38 | - name: Run mypy 39 | run: hatch run code-quality:types 40 | 41 | lint: 42 | name: Lint Code 43 | runs-on: ubuntu-latest 44 | steps: 45 | - uses: actions/checkout@v4 46 | - uses: actions/setup-python@v5 47 | with: 48 | python-version: "3.10" 49 | - name: Install Hatch 50 | run: pip install hatch==${{ env.HATCH_VERSION }} 51 | - name: Run pylint 
52 | run: hatch run code-quality:lint 53 | 54 | isort: 55 | name: Sort imports 56 | runs-on: ubuntu-latest 57 | steps: 58 | - uses: actions/checkout@v4 59 | - uses: actions/setup-python@v5 60 | with: 61 | python-version: "3.10" 62 | - name: Install Hatch 63 | run: pip install hatch==${{ env.HATCH_VERSION }} 64 | - name: Run mypy 65 | run: hatch run code-quality:sort 66 | 67 | pydocstyle: 68 | name: Check docstrings 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v4 72 | - uses: actions/setup-python@v5 73 | with: 74 | python-version: "3.10" 75 | - name: Install Hatch 76 | run: pip install hatch==${{ env.HATCH_VERSION }} 77 | - name: Run pydocstyle 78 | run: hatch run code-quality:docstrings 79 | 80 | scan-for-secrets: 81 | name: Scan for secrets 82 | runs-on: ubuntu-latest 83 | steps: 84 | - uses: actions/checkout@v4 85 | with: 86 | fetch-depth: 0 87 | - name: Install gitleaks 88 | run: wget -O - https://github.com/gitleaks/gitleaks/releases/download/v8.16.1/gitleaks_8.16.1_linux_x64.tar.gz | tar -xz 89 | - run: ./gitleaks detect --log-opts "${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}" 90 | if: github.event_name == 'pull_request' 91 | - run: ./gitleaks detect --log-opts "${{ github.event.before }}..${{ github.event.after }}" 92 | if: github.event_name == 'push' 93 | 94 | tests: 95 | name: Tests 96 | runs-on: ubuntu-latest 97 | env: 98 | API_KEY: "not-a-real-api-key" 99 | steps: 100 | - uses: actions/checkout@v4 101 | with: 102 | # for coverage comment action 103 | fetch-depth: 1000 104 | - uses: actions/setup-python@v5 105 | with: 106 | python-version: "3.10" 107 | - name: Install Hatch 108 | run: pip install hatch==${{ env.HATCH_VERSION }} 109 | - name: Run unit tests 110 | run: hatch run test:unit-with-cov 111 | - name: Coverage comment 112 | id: coverage_comment 113 | uses: py-cov-action/python-coverage-comment-action@0abd69a9baf90729d9b2d5b585fc790ec4e6f3dd 114 | with: 115 | GITHUB_TOKEN: ${{ github.token }} 116 | -------------------------------------------------------------------------------- /.github/workflows/deploy-prod.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to Prod PyPi 2 | 3 | on: 4 | release: 5 | types: 6 | - published 7 | workflow_dispatch: 8 | 9 | env: 10 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 11 | 12 | permissions: 13 | id-token: write 14 | 15 | jobs: 16 | deploy-prod: 17 | runs-on: ubuntu-latest 18 | environment: release 19 | env: 20 | pypi: ${{ vars.PYPI_URL }} 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.10" 26 | - name: Install Hatch 27 | run: pip install hatch==${{ env.HATCH_VERSION }} 28 | - name: Bump version 29 | # Bump version to Github action tag 30 | run: hatch version ${{github.ref_name}} 31 | - name: Build 32 | run: hatch build 33 | - name: publish 34 | uses: pypa/gh-action-pypi-publish@release/v1 35 | with: 36 | repository-url: ${{env.pypi}} 37 | -------------------------------------------------------------------------------- /.github/workflows/deploy-test.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to Test PyPi 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - labeled 7 | workflow_dispatch: 8 | 9 | env: 10 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 11 | 12 | jobs: 13 | deploy-test: 14 | if: ${{ github.event.label.name == 'test-deploy' }} || github.event.label.name 
!='integration'` 15 | uses: ./.github/workflows/deploy.yml 16 | with: 17 | deployment_env: test 18 | api_url: "https://api.dev.cloud.dpst.dev/api/v1" 19 | secrets: inherit 20 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | 3 | on: 4 | workflow_call: 5 | inputs: 6 | deployment_env: 7 | required: true 8 | type: string 9 | api_url: 10 | required: true 11 | default: https://api.dev.cloud.dpst.dev/api/v1 12 | type: string 13 | version: 14 | required: true 15 | type: string 16 | 17 | permissions: 18 | id-token: write 19 | 20 | env: 21 | HATCH_VERSION: "v1.14.0" # keep in sync with continuous-integration.yml 22 | 23 | jobs: 24 | build-and-deploy: 25 | runs-on: ubuntu-latest 26 | environment: ${{inputs.deployment_env}} 27 | env: 28 | pypi: ${{ vars.PYPI_URL }} 29 | 30 | steps: 31 | - uses: actions/checkout@v4 32 | - uses: actions/setup-python@v5 33 | with: 34 | python-version: "3.10" 35 | - name: Install Hatch 36 | run: pip install hatch==${{ env.HATCH_VERSION }} 37 | - name: Bump version 38 | # Bump version to Github action tag 39 | run: hatch version ${{ inputs.version }} 40 | - name: Build 41 | run: hatch build 42 | - name: publish 43 | uses: pypa/gh-action-pypi-publish@release/v1 44 | with: 45 | repository-url: ${{env.pypi}} 46 | -------------------------------------------------------------------------------- /.github/workflows/high-prio-bug.yml: -------------------------------------------------------------------------------- 1 | name: Slack alert for high priority bugs 2 | 3 | on: 4 | issues: 5 | types: 6 | - labeled 7 | 8 | env: 9 | SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_BUG_ALERT_URL }} 10 | SLACK_ALERT_CHANNEL: "#dc-alerts" 11 | 12 | jobs: 13 | add-comment: 14 | if: github.event.label.name == 'high' 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Send Slack alert for high priority bug 18 | uses: act10ns/slack@44541246747a30eb3102d87f7a4cc5471b0ffb7d 19 | with: 20 | status: High Priority Bug 21 | message: High Priority Bug ${{ github.event.issue.html_url }} 22 | channel: ${{ env.SLACK_ALERT_CHANNEL }} 23 | -------------------------------------------------------------------------------- /.github/workflows/merge-queue.yml: -------------------------------------------------------------------------------- 1 | name: Merge Queue 2 | 3 | on: 4 | merge_group: 5 | pull_request: 6 | types: [labeled, synchronize] 7 | workflow_call: 8 | inputs: 9 | api_url: 10 | required: true 11 | type: string 12 | deployment_env: 13 | required: true 14 | type: string 15 | secrets: 16 | API_KEY: 17 | required: true 18 | 19 | env: 20 | HATCH_VERSION: "v1.14.0" # keep in sync with deploy.yml 21 | 22 | jobs: 23 | # the API_KEYs are stored as a secret in the repository 24 | # we are using the "automated-tests" organization with predefined users and workspaces 25 | integration_tests: 26 | name: Tests 27 | environment: ${{ github.event.inputs.deployment_env }} 28 | runs-on: ubuntu-latest 29 | if: (github.event.action =='labeled' && github.event.label.name =='integration') || (github.event.action =='synchronize' && contains(github.event.pull_request.labels.*.name, 'integration')) || github.event.action =='workflow_call' || github.event_name == 'merge_group' 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Run integration tests 33 | uses: ./.github/actions/integration_tests 34 | with: 35 | API_KEY: "${{ inputs.deployment_env == 'release' && 
secrets.API_KEY_PROD || secrets.API_KEY}}" 36 | API_URL: "${{ inputs.api_url || 'https://api.dev.cloud.dpst.dev/api/v1'}}" 37 | 38 | build: 39 | name: Build package 40 | needs: [integration_tests] 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: actions/setup-python@v5 45 | with: 46 | python-version: "3.10" 47 | - name: Install Hatch 48 | run: pip install hatch==${{ env.HATCH_VERSION }} 49 | - name: Build 50 | run: hatch build 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # achive tests 10 | archive 11 | pytest_html_report.html 12 | output.json 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | *.env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | 168 | .vscode 169 | temp 170 | .idea 171 | .python-version 172 | .DS_Store 173 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.10 3 | fail_fast: true 4 | 5 | # We can't use local hooks since some developers use dev containers and commit outside an environment which has the 6 | # required dependencies installed. 7 | 8 | repos: 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v4.4.0 11 | hooks: 12 | - id: check-json # checks JSON files for parseable syntax. 13 | - id: check-yaml # checks yaml files for parseable syntax. 14 | - id: end-of-file-fixer # ensures that a file is either empty, or ends with one newline. 
15 | - id: trailing-whitespace # trims trailing whitespace 16 | 17 | - repo: https://github.com/PyCQA/autoflake 18 | rev: v2.1.1 19 | hooks: 20 | - id: autoflake 21 | args: 22 | - "--in-place" 23 | - "--expand-star-imports" 24 | - "--remove-duplicate-keys" 25 | - "--remove-unused-variables" 26 | - "-v" 27 | 28 | - repo: https://github.com/psf/black 29 | # Please keep these aligned with the versions defined in the pyproject.toml [tool.hatch.envs.code-quality] 30 | rev: 23.3.0 31 | hooks: 32 | - id: black 33 | 34 | - repo: https://github.com/pre-commit/mirrors-mypy 35 | # Please keep these aligned with the versions defined in the pyproject.toml [tool.hatch.envs.code-quality] 36 | rev: "v1.1.1" 37 | hooks: 38 | - id: mypy 39 | args: 40 | - "--ignore-missing-imports" 41 | additional_dependencies: 42 | - types-aiofiles==23.1.0.2 43 | - "types-tabulate~=0.9.0" 44 | - "types-requests~=2.28.11" 45 | # - "types-Markdown~=3.4.2" 46 | # - "types-PyYAML~=6.0.12" 47 | # - "types-python-dateutil~=2.8.19" 48 | # - "types-redis~=4.5.1" 49 | 50 | - repo: https://github.com/pycqa/isort 51 | # Please keep these aligned with the versions defined in the pyproject.toml [tool.hatch.envs.code-quality] 52 | rev: 5.12.0 53 | hooks: 54 | - id: isort 55 | args: ["--profile", "black"] 56 | 57 | - repo: https://github.com/zricethezav/gitleaks 58 | rev: v8.16.1 59 | hooks: 60 | - id: gitleaks 61 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Setup 4 | 5 | ### Build from source 6 | 7 | Install hatch 8 | ``` 9 | pip install hatch=="v1.14.0" 10 | ``` 11 | 12 | ### Install pre-commit hooks 13 | ``` 14 | hatch run code-quality:hooks 15 | ``` 16 | 17 | ## CI 18 | Code quality checks, unit tests, and integration tests (against dev) are performed on the creation of a PR, and subsequent pushes for that PR. 19 | Code quality checks, unit tests, and integration tests (against dev) are performed on a push to main. 20 | Integration tests are triggered whenever the e2e tests are triggered (environment will be dependent on e2e tests) 21 | Code quality checks, unit tests, and integration tests (against prod) are performed on the publishing of a release tag. 22 | 23 | ## Deploy to test PyPi 24 | 25 | When you create a PR in the deepset-cloud-sdk repository, add the 'test-deploy' label to trigger deployment to the test PyPi repository. 26 | 27 | ## Publishing to PyPi 28 | 29 | To publish a new version of the SDK you will need to create and publish a new release tag. 30 | 31 | 32 | ## Software design 33 | 34 | Have a look at this [README](/deepset_cloud_sdk/README.md) to get an overview of the software design. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2021 deepset GmbH 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ![deepset Cloud SDK](assets/logo.png) 3 |

4 | 5 | [![Coverage badge](https://github.com/deepset-ai/deepset-cloud-sdk/raw/python-coverage-comment-action-data/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/tree/python-coverage-comment-action-data) 6 | [![Tests](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/continuous-integration.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/continuous-integration.yml) 7 | [![Deploy PyPi](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/deploy-prod.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/deploy-prod.yml) 8 | [![Compliance Checks](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/compliance.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/compliance.yml) 9 | 10 | The deepset Cloud SDK is an open source software development kit that provides convenient access to and integration with deepset Cloud, a powerful cloud offering for various natural language processing (NLP) tasks. 11 | This README provides an overview of the SDK and its features, and information on contributing to the project and exploring related resources. 12 | 13 | - [Official SDK Docs](https://docs.cloud.deepset.ai/docs/working-with-the-sdk) 14 | - Tutorials: 15 | - [Uploading with CLI](https://docs.cloud.deepset.ai/docs/tutorial-uploading-files-with-cli) 16 | - [Uploading with Python Methods](https://docs.cloud.deepset.ai/docs/tutorial-uploading-files-with-python-methods) 17 | 18 | # Supported Features 19 | 20 | In its current shape, the SDK offers a suite of tools for seamless data upload to deepset Cloud. 21 | The following examples demonstrate how to use the deepset Cloud SDK to interact with deepset Cloud using Python. 22 | You can use the deepset Cloud SDK in the command line as well. For more information, see the [CLI documentation](docs/examples/cli/README.md). 23 | 24 | - [SDK Examples - Upload datasets](/docs/examples/sdk/upload.py) 25 | - [CLI Examples - Upload datasets](/docs/examples/cli/README.md) 26 | 27 | ## Installation 28 | The deepset Cloud SDK is available on [PyPI](https://pypi.org/project/deepset-cloud-sdk/) and you can install it using pip: 29 | ```bash 30 | pip install deepset-cloud-sdk 31 | ``` 32 | 33 | After installing the deepset Cloud SDK, you can use it to interact with deepset Cloud. It comes with a command line interface (CLI), that you can use by calling: 34 | ```bash 35 | deepset-cloud --help 36 | ``` 37 | 38 |
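A typical first session stores your credentials once with `deepset-cloud login` and then uploads a folder from the terminal. The upload invocation below is illustrative; see the [CLI examples](docs/examples/cli/README.md) for the exact commands and flags:

```bash
# One-time setup: stores your API key and default workspace in the global config file (~/.deepset-cloud/.env)
deepset-cloud login

# Upload a local folder to your workspace
# (illustrative flags; check `deepset-cloud upload --help` for the supported options)
deepset-cloud upload ./data --workspace-name my-workspace
```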

39 | ![deepset Cloud CLI](assets/cli.gif) 40 |

41 | 42 | ### Development Installation 43 | To install the deepset Cloud SDK for development, clone the repository and install the package in editable mode: 44 | ```bash 45 | pip install hatch==1.7.0 46 | hatch build 47 | ``` 48 | 49 | Instead of calling the CLI from the build package, you can call it directly from the source code: 50 | ```bash 51 | python3 -m deepset_cloud_sdk.cli --help 52 | ``` 53 | 54 | ## Contributing 55 | We welcome contributions from the open source community to enhance the deepset Cloud SDK. If you would like to contribute, have a look at [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines and instructions on how to get started. 56 | We appreciate your contributions, whether they're bug fixes, new features, or documentation improvements. 57 | 58 | 59 | --- 60 | 61 | ## Interested in Haystack? 62 | deepset Cloud is powered by Haystack, an open source framework for building end-to-end NLP pipelines. 63 | 64 | - [Project website](https://haystack.deepset.ai/) 65 | - [GitHub repository](https://github.com/deepset-ai/haystack) 66 | 67 | --- 68 | 69 | # Licenses 70 | 71 | The SDK is licensed under Apache 2.0, you can see the license [here](https://github.com/deepset-ai/deepset-cloud-sdk/blob/main/LICENSE) 72 | 73 | We use several libraries in this SDK that are licensed under the [MPL 2.0 license](https://www.mozilla.org/en-US/MPL/2.0/) 74 | 75 | - [tqdm](https://github.com/tqdm/tqdm) for progress bars 76 | - [pathspec](https://github.com/cpburnz/python-pathspec) for pattern matching file paths 77 | - [certifi](https://github.com/certifi/python-certifi) for validating trustworthiness of SSL certificates 78 | -------------------------------------------------------------------------------- /assets/cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/assets/cli.gif -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/assets/logo.png -------------------------------------------------------------------------------- /deepset_cloud_sdk/README.md: -------------------------------------------------------------------------------- 1 | # Software development kit for the deepset Cloud API 2 | 3 | This package is split into multiple layers: 4 | - API layer 5 | - Client layer 6 | - Service layer 7 | - Workflow layer 8 | 9 | 10 | ### API layer 11 | This layer is the lowest level of abstraction and contains the API definition, including all HTTP methods. It takes care of the authentication. 12 | You can find this layer in the `deepset_cloud_sdk/_api/deepset_cloud_api.py` file. We should implement reties on this lowest layer. 13 | 14 | ### Client layer 15 | This layer adds a thin wrapper around the API layer and provides a more convenient interface to the API. It includes explicit methods 16 | for endpoints by specifying the HTTP methods and endpoints for example for uploading files. 17 | 18 | ### Service layer 19 | This layer takes care of combining client methods to provide more complex functionality. Within this layer, we can add functionalities like 20 | creating sessions, uploading files, and closing sessions. 21 | 22 | ### Workflow layer 23 | Public methods for users. 
These workflows are split into async and sync implementation. 24 | 25 | 26 | ## Software architecture principles 27 | 28 | ### Factories 29 | We are using factories implemented like this: 30 | ```python 31 | @classmethod 32 | async def factory(cls, config: CommonConfig) -> YourClass: 33 | """Create a new instance of the API client. 34 | 35 | :param config: CommonConfig object. 36 | """ 37 | yield cls(config) 38 | ``` 39 | 40 | ### Tests 41 | We are using the classical pyramid of tests: unit tests (for each layer), integration tests. The goal is to gradually test each layer and 42 | then test the whole stack once within the integration tests. 43 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/__about__.py: -------------------------------------------------------------------------------- 1 | """This file defines the package version.""" 2 | # Will be automatically overridden during the release process 3 | # It's okay if this is outdated in the repo. We will use the tag from the release as the version. 4 | __version__ = "1.0.2" 5 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/__init__.py: -------------------------------------------------------------------------------- 1 | """This is the entrypoint for the package.""" 2 | 3 | 4 | import logging 5 | 6 | import structlog 7 | 8 | from deepset_cloud_sdk.workflows.pipeline_client import PipelineClient 9 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 10 | IndexConfig, 11 | IndexInputs, 12 | IndexOutputs, 13 | PipelineConfig, 14 | PipelineInputs, 15 | PipelineOutputs, 16 | ) 17 | 18 | structlog.configure( 19 | wrapper_class=structlog.make_filtering_bound_logger(logging.INFO), 20 | ) 21 | 22 | log = structlog.get_logger() 23 | 24 | __all__ = [ 25 | "PipelineClient", 26 | "PipelineConfig", 27 | "PipelineInputs", 28 | "PipelineOutputs", 29 | "IndexConfig", 30 | "IndexInputs", 31 | "IndexOutputs", 32 | ] 33 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_api/config.py: -------------------------------------------------------------------------------- 1 | """Config for loading env variables and setting default values.""" 2 | 3 | import os 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | 7 | import structlog 8 | from dotenv import load_dotenv 9 | 10 | logger = structlog.get_logger(__name__) 11 | 12 | ENV_FILE_PATH = Path.home() / ".deepset-cloud" / ".env" 13 | 14 | 15 | def load_environment(show_warnings: bool = True) -> bool: 16 | """Load environment variables using a cascading fallback model. 17 | 18 | 1. Load local .env file in current directory if it exists 19 | 2. Load from global ~/.deepset-cloud/.env to supplement local .env file 20 | 3. Environment variables can override both local and global .env files 21 | 22 | :param show_warnings: Whether to show warnings about missing files/variables 23 | :return: True if required environment variables were loaded successfully, False otherwise. 
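
    Minimal usage sketch (assumes API_KEY, API_URL, and DEFAULT_WORKSPACE_NAME are provided
    by one of the .env files or the process environment):

        >>> from deepset_cloud_sdk._api.config import load_environment
        >>> load_environment(show_warnings=True)
        True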
24 | """ 25 | current_path_env = Path.cwd() / ".env" 26 | local_loaded = current_path_env.is_file() and load_dotenv(current_path_env) 27 | global_loaded = ENV_FILE_PATH.is_file() and load_dotenv(ENV_FILE_PATH, override=False) 28 | 29 | if local_loaded: 30 | logger.info(f"Environment variables successfully loaded from local .env file at {current_path_env}.") 31 | if global_loaded: 32 | if local_loaded: 33 | logger.info(f"Loaded global .env file at {ENV_FILE_PATH} to supplement local .env file.") 34 | else: 35 | logger.info(f"Environment variables successfully loaded from global .env file at {ENV_FILE_PATH}.") 36 | 37 | if not (local_loaded or global_loaded) and show_warnings: 38 | logger.warning( 39 | "No .env files found. Run `deepset-cloud login` to create a global configuration file. " 40 | "You can also create a custom local .env file in your project directory." 41 | ) 42 | return False 43 | 44 | # Check for required environment variables 45 | required_vars = ["API_KEY", "API_URL", "DEFAULT_WORKSPACE_NAME"] 46 | missing_vars = [var for var in required_vars if not os.getenv(var)] 47 | 48 | if missing_vars and show_warnings: 49 | logger.warning( 50 | f"Missing required environment variables: {', '.join(missing_vars)}. " 51 | "Run `deepset-cloud login` to set up your configuration or set these variables " 52 | "manually in your .env file." 53 | ) 54 | return False 55 | 56 | return True 57 | 58 | 59 | # Load environment variables silently at import time to support CLI commands that depend on .env files. 60 | # Warnings are only shown later in CommonConfig when users don't provide explicit parameters 61 | # and the config values fall back to global defaults. 62 | load_environment(show_warnings=False) 63 | 64 | # connection to deepset AI Platform 65 | API_URL: str = os.getenv("API_URL", "https://api.cloud.deepset.ai/api/v1") 66 | 67 | API_KEY: str = os.getenv("API_KEY", "") 68 | 69 | # configuration to use a selected workspace 70 | DEFAULT_WORKSPACE_NAME: str = os.getenv("DEFAULT_WORKSPACE_NAME", "") 71 | 72 | ASYNC_CLIENT_TIMEOUT: int = int(os.getenv("ASYNC_CLIENT_TIMEOUT", "300")) 73 | 74 | 75 | @dataclass 76 | class CommonConfig: 77 | """Common config for connecting to the deepset AI Platform. 78 | 79 | Configuration is loaded in the following order of precedence: 80 | 1. Explicit parameters passed to this class 81 | 2. Environment variables 82 | 3. Local .env file in project root 83 | 4. Global .env file in ~/.deepset-cloud/ (supplements local .env) 84 | 5. Built-in defaults 85 | """ 86 | 87 | api_key: str = "" 88 | api_url: str = "" 89 | safe_mode: bool = False 90 | 91 | def __post_init__(self) -> None: 92 | """Validate config.""" 93 | # Only try loading from environment if user didn't provide explicit parameters) 94 | if not self.api_key or not self.api_url: 95 | load_environment(show_warnings=True) 96 | if not self.api_key: 97 | self.api_key = os.getenv("API_KEY", "") 98 | if not self.api_url: 99 | self.api_url = os.getenv("API_URL", "https://api.cloud.deepset.ai/api/v1") 100 | 101 | if not self.api_key: 102 | raise ValueError( 103 | "API key is required. Either set the API_KEY environment variable or pass api_key parameter. Go to [API Keys](https://cloud.deepset.ai/settings/api-keys) in deepset AI Platform to get an API key." 
104 | ) 105 | 106 | if self.api_url.endswith("/"): 107 | self.api_url = self.api_url[:-1] 108 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_api/deepset_cloud_api.py: -------------------------------------------------------------------------------- 1 | """DeepsetCloudAPI class.""" 2 | from __future__ import annotations 3 | 4 | from contextlib import asynccontextmanager 5 | from typing import Any, AsyncGenerator, Callable, Dict, Optional 6 | 7 | import httpx 8 | import structlog 9 | from httpx import Response 10 | from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed 11 | 12 | from deepset_cloud_sdk._api.config import CommonConfig 13 | 14 | logger = structlog.get_logger(__name__) 15 | 16 | 17 | DEFAULT_MAX_ATTEMPTS = 3 18 | SAFE_MODE_MAX_ATTEMPTS = 10 19 | 20 | 21 | class WorkspaceNotDefinedError(Exception): 22 | """The workspace_name is not defined. Set an environment variable or pass the `workspace_name` argument.""" 23 | 24 | 25 | class DeepsetCloudAPI: 26 | """deepset Cloud API client. 27 | 28 | This class takes care of all API calls to deepset Cloud and handles authentication and errors. 29 | """ 30 | 31 | def __init__(self, config: CommonConfig, client: httpx.AsyncClient) -> None: 32 | """Create a deepset Cloud API client. 33 | 34 | Add a config for authentication and a HTTPX client for 35 | sending requests. 36 | 37 | :param config: Config for authentication. 38 | :param client: HTTPX client for sending requests. 39 | """ 40 | self.headers = { 41 | "Accept": "application/json", 42 | "Authorization": f"Bearer {config.api_key}", 43 | "X-Client-Source": "deepset-cloud-sdk", 44 | } 45 | self.base_url = lambda workspace_name: self._get_base_url(config.api_url)(workspace_name) 46 | self.client = client 47 | self.max_attempts = SAFE_MODE_MAX_ATTEMPTS if config.safe_mode else DEFAULT_MAX_ATTEMPTS 48 | 49 | @staticmethod 50 | def _get_base_url(api_url: str) -> Callable: 51 | def func(workspace_name: str) -> str: 52 | """Get the base URL for the API. 53 | 54 | :param workspace_name: Name of the workspace to use. 55 | :return: Base URL. 56 | """ 57 | if not workspace_name or workspace_name == "": 58 | raise WorkspaceNotDefinedError( 59 | f"Workspace name is not defined. Got '{workspace_name}'. Enter the name of the workspace in `workspace_name`." 60 | ) 61 | 62 | return f"{api_url}/workspaces/{workspace_name}" 63 | 64 | return func 65 | 66 | @classmethod 67 | @asynccontextmanager 68 | async def factory(cls, config: CommonConfig) -> AsyncGenerator[DeepsetCloudAPI, None]: 69 | """Create a new instance of the API client. 70 | 71 | :param config: CommonConfig object. 72 | """ 73 | if config.safe_mode: 74 | safe_mode_limits = httpx.Limits(max_keepalive_connections=1, max_connections=1) 75 | safe_mode_timeout = httpx.Timeout(None) 76 | async with httpx.AsyncClient(limits=safe_mode_limits, timeout=safe_mode_timeout) as client: 77 | yield cls(config, client) 78 | else: 79 | async with httpx.AsyncClient() as client: 80 | yield cls(config, client) 81 | 82 | async def get( 83 | self, workspace_name: str, endpoint: str, params: Optional[Dict[str, Any]] = None, timeout_s: int = 20 84 | ) -> Response: 85 | """Make a GET request to the deepset Cloud API. 86 | 87 | :param workspace_name: Name of the workspace to use. 88 | :param endpoint: Endpoint to call. 89 | :param params: Query parameters to pass. 90 | :param timeout_s: Timeout in seconds. 91 | :return: Response object. 
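Example (illustrative; assumes an existing `CommonConfig` named `config` and a workspace called "my-workspace"):

    async with DeepsetCloudAPI.factory(config) as api:
        response = await api.get("my-workspace", "files", params={"limit": 10})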
92 | """ 93 | 94 | @retry( 95 | retry=retry_if_exception_type(httpx.RequestError), 96 | stop=stop_after_attempt(self.max_attempts), 97 | wait=wait_fixed(1), 98 | reraise=True, 99 | ) 100 | async def retry_wrapper() -> Response: 101 | return await self._get(workspace_name, endpoint, params, timeout_s) 102 | 103 | return await retry_wrapper() 104 | 105 | async def _get( 106 | self, workspace_name: str, endpoint: str, params: Optional[Dict[str, Any]] = None, timeout_s: int = 20 107 | ) -> Response: 108 | response = await self.client.get( 109 | f"{self.base_url(workspace_name)}/{endpoint}", 110 | params=params or {}, 111 | headers=self.headers, 112 | timeout=timeout_s, 113 | ) 114 | logger.debug( 115 | "Called deepset Cloud API.", 116 | method="GET", 117 | workspace=workspace_name, 118 | endpoint=endpoint, 119 | params=params, 120 | status=response.status_code, 121 | ) 122 | return response 123 | 124 | async def post( 125 | self, 126 | workspace_name: str, 127 | endpoint: str, 128 | params: Optional[Dict[str, Any]] = None, 129 | json: Optional[Dict[str, Any]] = None, 130 | files: Optional[Dict[str, Any]] = None, 131 | data: Optional[Dict[str, Any]] = None, 132 | timeout_s: int = 20, 133 | ) -> Response: 134 | """Make a POST request to the deepset Cloud API. 135 | 136 | :param workspace_name: Name of the workspace to use. 137 | :param endpoint: Endpoint to call. 138 | :param params: Query parameters to pass. 139 | :param json: JSON data to pass. 140 | :param data: Data to pass. 141 | :param files: Files to pass. 142 | :param timeout_s: Timeout in seconds. 143 | :return: Response object. 144 | """ 145 | response = await self.client.post( 146 | f"{self.base_url(workspace_name)}/{endpoint}", 147 | params=params or {}, 148 | json=json or {}, 149 | data=data or {}, 150 | files=files, 151 | headers=self.headers, 152 | timeout=timeout_s, 153 | ) 154 | logger.debug( 155 | "Called deepset Cloud API", 156 | method="POST", 157 | workspace=workspace_name, 158 | endpoint=endpoint, 159 | data=data or {}, 160 | files=files, 161 | status=response.status_code, 162 | ) 163 | return response 164 | 165 | async def delete( 166 | self, workspace_name: str, endpoint: str, params: Optional[Dict[str, Any]] = None, timeout_s: int = 20 167 | ) -> Response: 168 | """ 169 | Make a DELETE request to the deepset Cloud API. 170 | 171 | :param workspace_name: Name of the workspace to use. 172 | :param endpoint: Endpoint to call. 173 | :param params: Query parameters to pass. 174 | :param timeout_s: Timeout in seconds. 175 | :return: Response object. 176 | """ 177 | response = await self.client.delete( 178 | f"{self.base_url(workspace_name)}/{endpoint}", 179 | params=params or {}, 180 | headers=self.headers, 181 | timeout=timeout_s, 182 | ) 183 | logger.debug( 184 | "Called deepset Cloud API", 185 | method="DELETE", 186 | workspace=workspace_name, 187 | endpoint=endpoint, 188 | params=params, 189 | status=response.status_code, 190 | ) 191 | return response 192 | 193 | async def put( 194 | self, 195 | workspace_name: str, 196 | endpoint: str, 197 | params: Optional[Dict[str, Any]] = None, 198 | data: Optional[Dict[str, Any]] = None, 199 | timeout_s: int = 20, 200 | ) -> Response: 201 | """Make a PUT request to the deepset Cloud API. 202 | 203 | :param workspace_name: Name of the workspace to use. 204 | :param endpoint: Endpoint to call. 205 | :param params: Query parameters to pass. 206 | :param data: Data to pass. 207 | :param timeout_s: Timeout in seconds. 208 | :return: Response object. 
209 | """ 210 | 211 | @retry( 212 | retry=retry_if_exception_type(httpx.ConnectError), 213 | stop=stop_after_attempt(self.max_attempts), 214 | wait=wait_fixed(1), 215 | reraise=True, 216 | ) 217 | async def retry_wrapper() -> Response: 218 | return await self._put(workspace_name, endpoint, params, data, timeout_s) 219 | 220 | return await retry_wrapper() 221 | 222 | async def _put( 223 | self, 224 | workspace_name: str, 225 | endpoint: str, 226 | params: Optional[Dict[str, Any]] = None, 227 | data: Optional[Dict[str, Any]] = None, 228 | timeout_s: int = 20, 229 | ) -> Response: 230 | response = await self.client.put( 231 | f"{self.base_url(workspace_name)}/{endpoint}", 232 | params=params or {}, 233 | json=data or {}, 234 | headers=self.headers, 235 | timeout=timeout_s, 236 | ) 237 | logger.debug( 238 | "Called deepset Cloud API", 239 | method="PUT", 240 | workspace=workspace_name, 241 | endpoint=endpoint, 242 | data=data or {}, 243 | status=response.status_code, 244 | ) 245 | return response 246 | 247 | 248 | def get_deepset_cloud_api(config: CommonConfig, client: httpx.AsyncClient) -> DeepsetCloudAPI: # noqa 249 | """deepset Cloud API factory. Return an instance of DeepsetCloudAPI. 250 | 251 | :param config: CommonConfig object. 252 | :param client: httpx.AsyncClient object. 253 | :return: DeepsetCloudAPI object. 254 | """ 255 | return DeepsetCloudAPI(config=config, client=client) 256 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_api/files.py: -------------------------------------------------------------------------------- 1 | """ 2 | File API for deepset Cloud. 3 | 4 | This module takes care of all file-related API calls to deepset Cloud, including uploading, downloading, listing, and 5 | deleting files. 6 | """ 7 | 8 | import datetime 9 | import inspect 10 | import json 11 | from dataclasses import dataclass 12 | from pathlib import Path 13 | from typing import Any, Dict, List, Optional, Union 14 | from uuid import UUID 15 | 16 | import structlog 17 | from httpx import codes 18 | 19 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 20 | from deepset_cloud_sdk._api.upload_sessions import WriteMode 21 | from deepset_cloud_sdk._utils.datetime import from_isoformat 22 | 23 | logger = structlog.get_logger(__name__) 24 | 25 | 26 | class NotMatchingFileTypeException(Exception): 27 | """Exception raised when a file is not matching the file type.""" 28 | 29 | 30 | class FileNotFoundInDeepsetCloudException(Exception): 31 | """Exception raised when a file is not found.""" 32 | 33 | 34 | class FailedToUploadFileException(Exception): 35 | """Exception raised when a file failed to be uploaded.""" 36 | 37 | 38 | @dataclass 39 | class File: 40 | """File primitive from deepset Cloud. This dataclass is used for all file-related operations that don't include the actual file content.""" 41 | 42 | file_id: UUID 43 | url: str 44 | name: str 45 | size: int 46 | created_at: datetime.datetime 47 | meta: Dict[str, Any] 48 | 49 | @classmethod 50 | def from_dict(cls, env: Dict[str, Any]) -> Any: 51 | """Parse a dictionary into a File object. 52 | 53 | Ignores keys that don't exist. 54 | 55 | :param env: Dictionary to parse. 
56 | """ 57 | to_parse = {k: v for k, v in env.items() if k in inspect.signature(cls).parameters} 58 | to_parse["created_at"] = from_isoformat(to_parse["created_at"]) 59 | to_parse["file_id"] = UUID(to_parse["file_id"]) 60 | return cls(**to_parse) 61 | 62 | 63 | @dataclass 64 | class FileList: 65 | """List of files from deepset Cloud. This dataclass is used for all file-related operations that return a list of files.""" 66 | 67 | total: int 68 | data: List[File] 69 | has_more: bool 70 | 71 | 72 | class FilesAPI: 73 | """File API for deepset Cloud. 74 | 75 | This module takes care of all file-related API calls to deepset Cloud, including 76 | uploading, downloading, listing, and deleting files. 77 | 78 | :param deepset_cloud_api: Instance of the DeepsetCloudAPI. 79 | """ 80 | 81 | def __init__(self, deepset_cloud_api: DeepsetCloudAPI) -> None: 82 | """ 83 | Create FileAPI object. 84 | 85 | :param deepset_cloud_api: Instance of the DeepsetCloudAPI. 86 | """ 87 | self._deepset_cloud_api = deepset_cloud_api 88 | 89 | async def list_paginated( 90 | self, 91 | workspace_name: str, 92 | limit: int = 100, 93 | name: Optional[str] = None, 94 | odata_filter: Optional[str] = None, 95 | after_value: Optional[Any] = None, 96 | after_file_id: Optional[UUID] = None, 97 | ) -> FileList: 98 | """ 99 | List files in a workspace using cursor-based pagination. 100 | 101 | :param workspace_name: Name of the workspace to use. 102 | :param limit: Number of files to return per page. 103 | :param name: Name of the file to odata_filter by. 104 | :param odata_filter: Odata odata_filter to apply. 105 | :param after_value: Value to start after. 106 | :param after_file_id: File ID to start after. 107 | """ 108 | params: Dict[str, Union[str, int]] = {"limit": limit} 109 | if after_value and after_file_id: 110 | params["after_value"] = ( 111 | after_value.isoformat() if isinstance(after_value, datetime.datetime) else str(after_value) 112 | ) 113 | params["after_file_id"] = str(after_file_id) 114 | 115 | # substring match file name 116 | if name: 117 | params["name"] = name 118 | 119 | # odata odata_filter for file meta 120 | if odata_filter: 121 | params["filter"] = odata_filter 122 | 123 | response = await self._deepset_cloud_api.get(workspace_name, "files", params=params) 124 | assert response.status_code == codes.OK, f"Failed to list files: {response.text}" 125 | response_body = response.json() 126 | total = response_body["total"] 127 | data = response_body["data"] 128 | has_more = response_body["has_more"] 129 | return FileList(total=total, data=[File.from_dict(d) for d in data], has_more=has_more) 130 | 131 | @staticmethod 132 | def _available_file_name(file_path: Path, suffix: str = "_1") -> str: 133 | logger.warning("File already exists. Renaming file to avoid overwriting.", file_path=str(file_path)) 134 | base = file_path.stem 135 | ext = file_path.suffix 136 | new_filename = file_path.with_name(f"{base}{suffix}{ext}") 137 | while new_filename.exists(): 138 | suffix = f"_{int(suffix[1:]) + 1}" 139 | new_filename = file_path.with_name(f"{base}{suffix}{ext}") 140 | return str(new_filename) 141 | 142 | async def _save_to_disk(self, file_dir: Path, file_name: str, content: bytes) -> str: 143 | """Save the given content to disk. 144 | 145 | If there is a collision, the file name is changed to avoid overwriting. 146 | This new name is returned by the function. 147 | 148 | :param file_dir: Path to the file. 149 | :param file_name: Name of the file. 150 | :param content: Content of the file. 
151 | :return: The new file name. 152 | """ 153 | # Check if the directory exists, and create it if necessary 154 | file_dir.mkdir(parents=True, exist_ok=True) 155 | 156 | new_filename: str = file_name 157 | file_path = file_dir / file_name 158 | if file_path.exists(): 159 | new_filename = self._available_file_name(file_path) 160 | 161 | with (file_dir / new_filename).open("wb") as file: 162 | file.write(content) 163 | return new_filename 164 | 165 | async def direct_upload_path( 166 | self, 167 | workspace_name: str, 168 | file_path: Union[Path, str], 169 | file_name: Optional[str] = None, 170 | meta: Optional[Dict[str, Any]] = None, 171 | write_mode: WriteMode = WriteMode.KEEP, 172 | ) -> UUID: 173 | """Directly upload a file to deepset Cloud. 174 | 175 | :param workspace_name: Name of the workspace to use. 176 | :param file_path: Path to the file to upload. 177 | :param file_name: Name of the file to upload. 178 | :param meta: Meta information to attach to the file. 179 | :param write_mode: Specifies what to do when a file with the same name already exists in the workspace. 180 | Possible options are: 181 | KEEP - uploads the file with the same name and keeps both files in the workspace. 182 | OVERWRITE - overwrites the file that is in the workspace. 183 | FAIL - fails to upload the file with the same name. 184 | :return: ID of the uploaded file. 185 | """ 186 | if isinstance(file_path, str): 187 | file_path = Path(file_path) 188 | 189 | if file_name is None: 190 | file_name = file_path.name 191 | 192 | with file_path.open("rb") as file: 193 | response = await self._deepset_cloud_api.post( 194 | workspace_name, 195 | "files", 196 | files={"file": (file_name, file), "meta": (None, json.dumps(meta))}, 197 | params={"write_mode": write_mode.value}, 198 | ) 199 | if response.status_code != codes.CREATED or response.json().get("file_id") is None: 200 | raise FailedToUploadFileException( 201 | f"Failed to upload file with status code {response.status_code}. response was: {response.text}" 202 | ) 203 | file_id: UUID = UUID(response.json()["file_id"]) 204 | return file_id 205 | 206 | async def direct_upload_in_memory( 207 | self, 208 | workspace_name: str, 209 | content: Union[bytes, str], 210 | file_name: str, 211 | meta: Optional[Dict[str, Any]] = None, 212 | write_mode: WriteMode = WriteMode.KEEP, 213 | ) -> UUID: 214 | """Directly upload files to deepset Cloud. 215 | 216 | :param workspace_name: Name of the workspace to use. 217 | :param content: File text to upload. 218 | :param file_name: Name of the file to upload. 219 | :param meta: Meta information to attach to the file. 220 | :param write_mode: Specifies what to do when a file with the same name already exists in the workspace. 221 | Possible options are: 222 | KEEP - uploads the file with the same name and keeps both files in the workspace. 223 | OVERWRITE - overwrites the file that is in the workspace. 224 | FAIL - fails to upload the file with the same name. 225 | :return: ID of the uploaded file. 226 | """ 227 | response = await self._deepset_cloud_api.post( 228 | workspace_name, 229 | "files", 230 | files={"file": (file_name, content)}, 231 | data={"meta": json.dumps(meta)}, 232 | params={"write_mode": write_mode.value}, 233 | ) 234 | 235 | if response.status_code != codes.CREATED or response.json().get("file_id") is None: 236 | raise FailedToUploadFileException( 237 | f"Failed to upload file with status code {response.status_code}. 
response was: {response.text}" 238 | ) 239 | file_id: UUID = UUID(response.json()["file_id"]) 240 | return file_id 241 | 242 | async def download( 243 | self, 244 | workspace_name: str, 245 | file_id: UUID, 246 | file_name: str, 247 | include_meta: bool = True, 248 | file_dir: Optional[Union[Path, str]] = None, 249 | ) -> None: 250 | """Download a single file from a workspace. 251 | 252 | :param workspace_name: Name of the workspace to use. 253 | :param file_id: ID of the file to download. 254 | :param file_name: Name assigned to the downloaded file. 255 | :param include_meta: Whether to include the file meta in the folder. 256 | :param file_dir: Location to save the file locally. If not provided the current directory is used. 257 | """ 258 | if file_dir is None: 259 | file_dir = Path.cwd() 260 | 261 | if isinstance(file_dir, str): 262 | # format dir to Path and take relative path into account 263 | file_dir = Path(file_dir).resolve() 264 | 265 | response = await self._deepset_cloud_api.get(workspace_name, f"files/{file_id}") 266 | if response.status_code == codes.NOT_FOUND: 267 | raise FileNotFoundInDeepsetCloudException(f"Failed to download raw file: {response.text}") 268 | if response.status_code != codes.OK: 269 | raise Exception(f"Failed to download raw file: {response.text}") 270 | new_local_file_name: str = await self._save_to_disk( 271 | file_dir=file_dir, file_name=file_name, content=response.content 272 | ) 273 | 274 | if include_meta: 275 | response = await self._deepset_cloud_api.get(workspace_name, f"files/{file_id}/meta") 276 | if response.status_code == codes.NOT_FOUND: 277 | raise FileNotFoundInDeepsetCloudException(f"Failed to download raw file: {response.text}") 278 | if response.status_code != codes.OK: 279 | raise Exception(f"Failed to download raw file: {response.text}") 280 | await self._save_to_disk( 281 | file_dir=file_dir, 282 | file_name=f"{new_local_file_name}.meta.json", 283 | content=response.content, 284 | ) 285 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_s3/__init__.py: -------------------------------------------------------------------------------- 1 | """Module that handles s3 interactions.""" 2 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """A set of utils for the SDK.""" 2 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/_utils/datetime.py: -------------------------------------------------------------------------------- 1 | """Utility functions for working with datetime objects.""" 2 | from datetime import datetime 3 | 4 | 5 | def from_isoformat(date_str: str) -> datetime: 6 | """Parse a date string in ISO 8601 format and returns a datetime object. 
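For example, "2024-01-01T12:00:00Z" and "2024-01-01T12:00:00+00:00" both parse to the same timezone-aware datetime.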
7 | 8 | Our new Pydantic 2.0 API returns with the `Z` suffix, but the old one returns with `+00:00` 9 | Python versions < 3.12 don't support the `Z` suffix, so we need to replace it with `+00:00` 10 | """ 11 | date_str = date_str.replace("Z", "+00:00") 12 | return datetime.fromisoformat(date_str) 13 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/models.py: -------------------------------------------------------------------------------- 1 | """General data classes for deepset Cloud SDK.""" 2 | import json 3 | from abc import abstractmethod 4 | from dataclasses import dataclass 5 | from typing import Any, Dict, Optional, Union 6 | from uuid import UUID 7 | 8 | 9 | @dataclass 10 | class UserInfo: 11 | """User info data class.""" 12 | 13 | user_id: UUID 14 | given_name: str 15 | family_name: str 16 | 17 | 18 | class DeepsetCloudFileBase: # pylint: disable=too-few-public-methods 19 | """Base class for deepset Cloud files.""" 20 | 21 | def __init__(self, name: str, meta: Optional[Dict[str, Any]] = None): 22 | """ 23 | Initialize DeepsetCloudFileBase. 24 | 25 | :param name: The file name 26 | :param meta: The file's metadata 27 | """ 28 | self.name = name 29 | self.meta = meta 30 | 31 | @abstractmethod 32 | def content(self) -> Union[str, bytes]: 33 | """Return content.""" 34 | raise NotImplementedError 35 | 36 | def meta_as_string(self) -> str: 37 | """Return metadata as a string.""" 38 | if self.meta: 39 | return json.dumps(self.meta) 40 | 41 | return json.dumps({}) 42 | 43 | 44 | class DeepsetCloudFile(DeepsetCloudFileBase): # pylint: disable=too-few-public-methods 45 | """Data class for text files in deepset Cloud.""" 46 | 47 | def __init__(self, text: str, name: str, meta: Optional[Dict[str, Any]] = None): 48 | """ 49 | Initialize DeepsetCloudFileBase. 50 | 51 | :param name: The file name 52 | :param text: The text content of the file 53 | :param meta: The file's metadata 54 | """ 55 | super().__init__(name, meta) 56 | self.text = text 57 | 58 | def content(self) -> str: 59 | """ 60 | Return the content of the file. 61 | 62 | :return: The text of the file. 63 | """ 64 | return self.text 65 | 66 | 67 | # Didn't want to cause breaking changes in the DeepsetCloudFile class, though it 68 | # is technically the same as the below, the naming of the text field will be confusing 69 | # for users that are uploading anything other than text. 70 | 71 | 72 | class DeepsetCloudFileBytes(DeepsetCloudFileBase): # pylint: disable=too-few-public-methods 73 | """Data class for uploading files of any valid type in deepset Cloud.""" 74 | 75 | def __init__(self, file_bytes: bytes, name: str, meta: Optional[Dict[str, Any]] = None): 76 | """ 77 | Initialize DeepsetCloudFileBase. 78 | 79 | :param name: The file name 80 | :param file_bytes: The content of the file represented in bytes 81 | :param meta: The file's metadata 82 | """ 83 | super().__init__(name, meta) 84 | self.file_bytes = file_bytes 85 | 86 | def content(self) -> bytes: 87 | """ 88 | Return the content of the file in bytes. 89 | 90 | :return: The content of the file in bytes. 
91 | """ 92 | return self.file_bytes 93 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflows for deepset AI platform SDK.""" 2 | 3 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 4 | IndexConfig, 5 | IndexInputs, 6 | IndexOutputs, 7 | PipelineConfig, 8 | PipelineInputs, 9 | PipelineOutputs, 10 | ) 11 | from deepset_cloud_sdk.workflows.pipeline_client.pipeline_client import PipelineClient 12 | 13 | __all__ = [ 14 | "PipelineInputs", 15 | "IndexInputs", 16 | "IndexOutputs", 17 | "PipelineOutputs", 18 | "IndexConfig", 19 | "PipelineConfig", 20 | "PipelineClient", 21 | ] 22 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/async_client/__init__.py: -------------------------------------------------------------------------------- 1 | """Async implementation of workflows client.""" 2 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/pipeline_client/__init__.py: -------------------------------------------------------------------------------- 1 | """Package to enable importing pipelines and indexes to deepset AI platform.""" 2 | 3 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 4 | IndexConfig, 5 | IndexInputs, 6 | IndexOutputs, 7 | PipelineConfig, 8 | PipelineInputs, 9 | PipelineOutputs, 10 | ) 11 | from deepset_cloud_sdk.workflows.pipeline_client.pipeline_client import PipelineClient 12 | 13 | __all__ = [ 14 | "PipelineClient", 15 | "PipelineInputs", 16 | "IndexInputs", 17 | "IndexOutputs", 18 | "PipelineOutputs", 19 | "IndexConfig", 20 | "PipelineConfig", 21 | ] 22 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/pipeline_client/models.py: -------------------------------------------------------------------------------- 1 | """Models for the pipeline service.""" 2 | from typing import List 3 | 4 | from pydantic import BaseModel, Field, model_validator 5 | 6 | 7 | class InputOutputBaseModel(BaseModel): 8 | """Base model for input and output configurations. 9 | 10 | This class provides common functionality for input and output models, such as YAML conversion. 11 | """ 12 | 13 | def to_yaml_dict(self) -> dict: 14 | """Convert the model to a YAML-compatible dictionary. 15 | 16 | Clears empty values from the dictionary. 17 | 18 | :return: Dictionary ready for YAML serialization. 19 | """ 20 | fields = self.model_dump(exclude_none=True) 21 | # Remove empty values 22 | return {k: v for k, v in fields.items() if v} 23 | 24 | 25 | class PipelineInputs(InputOutputBaseModel): 26 | """Pipeline input configuration. 27 | 28 | Defines the components that should receive the Query input and any filters that apply to it. 29 | 30 | :param query: List of components that will receive the `query` input. 31 | Specify each component in the format: '.', for example: 'retriever.query' 32 | :param filters: Optional list of components that will receive the filters input. 33 | Specify each component using the format: '.', for example: 'retriever.filters'. 34 | """ 35 | 36 | model_config = {"extra": "allow"} # Allow additional fields in inputs 37 | 38 | query: List[str] = Field( 39 | ..., 40 | description=( 41 | "List of components and parameters that will receive the `query` input when they are executed. 
" 42 | "Use the format: '.', for example: 'retriever.query'." 43 | ), 44 | min_items=1, 45 | ) 46 | filters: List[str] = Field( 47 | default_factory=list, 48 | description=( 49 | "List of components and parameters that will receive the `filters` input when they are executed. " 50 | "Use the format: '.', for example: 'retriever.filters'." 51 | ), 52 | ) 53 | 54 | 55 | class PipelineOutputs(InputOutputBaseModel): 56 | """Pipeline output configuration. 57 | 58 | Specify the components that will output `documents`, `answers`, or both. 59 | You must include at least one. The outputs of these components become the final output of the pipeline. 60 | 61 | :param documents: Name of the component and parameter that will provide `documents` as output. 62 | Use the format '.', for example: 'retriever.documents'. 63 | :param answers: Name of the component and parameter that will provide `answers` as output. 64 | Use the format '.', for example: 'reader.answers'. 65 | """ 66 | 67 | model_config = {"extra": "allow"} # Allow additional fields in outputs 68 | 69 | documents: str | None = Field( 70 | default=None, 71 | description="Name of the component that will provide `documents` as output. " 72 | "Format: '.', for example: 'meta_ranker.documents'", 73 | ) 74 | answers: str | None = Field( 75 | default=None, 76 | description="Name of the component that will provide `answers` as output. " 77 | "Format: '.', for example: 'answers_builder.answers'", 78 | ) 79 | 80 | @model_validator(mode="after") 81 | def validate_documents_xor_answers(self) -> "PipelineOutputs": 82 | """Validate that either `documents`, `answers`, or both are defined.""" 83 | if not self.documents and not self.answers: 84 | raise ValueError("Define at least one pipeline output, either 'documents, 'answers' or both.") 85 | return self 86 | 87 | 88 | class IndexOutputs(InputOutputBaseModel): 89 | """Output configuration for the index. 90 | 91 | Index outputs are optional. 92 | """ 93 | 94 | model_config = {"extra": "allow"} # Allow additional fields in outputs 95 | 96 | 97 | class PipelineConfig(BaseModel): 98 | """Configuration required to import the pipeline into deepset AI Platform. 99 | 100 | :param name: Name of the pipeline to be imported 101 | :param inputs: Pipeline input configuration. Use `PipelineInputs` model to define the inputs. 102 | :param outputs: Pipeline output configuration. Use `PipelineOutputs` model to define the outputs. 103 | """ 104 | 105 | model_config = {"extra": "forbid"} 106 | 107 | name: str = Field(..., description="The name of the pipeline to be imported", min_length=1) 108 | inputs: PipelineInputs = Field( 109 | default_factory=PipelineInputs, 110 | description=("Pipeline input configuration. Use `PipelineInputs` model to define the inputs."), 111 | ) 112 | outputs: PipelineOutputs = Field( 113 | default_factory=PipelineOutputs, 114 | description=("Pipeline output configuration. Use `PipelineOutputs` model to define the outputs."), 115 | ) 116 | 117 | 118 | class IndexInputs(InputOutputBaseModel): 119 | """Configuration required to import an index into deepset AI Platform. 120 | 121 | Defines the index components that should receive the `Files` input. 122 | 123 | :param files: List of components and parameters that should receive files as input. 124 | Specify the components using the format: '.', for example: 'file_type_router.sources'. 
125 | """ 126 | 127 | model_config = {"extra": "allow"} # Allow additional fields in inputs 128 | 129 | files: List[str] = Field( 130 | default_factory=list, 131 | description=( 132 | "List of components and parameters that will receive files as input when they're executed. " 133 | "Format: '.', for example: 'file_type_router.sources'." 134 | ), 135 | ) 136 | 137 | 138 | class IndexConfig(BaseModel): 139 | """Index configuration for importing an index to deepset AI platform. 140 | 141 | :param name: Name of the index to be imported. 142 | :param inputs: Index input configuration. Use `IndexInputs` model to define the inputs. 143 | :param outputs: Index output configuration. Optional. Use `IndexOutputs` model to define the outputs. 144 | """ 145 | 146 | model_config = {"extra": "forbid"} 147 | 148 | name: str = Field(..., description="Name of the index to be imported.", min_length=1) 149 | inputs: IndexInputs = Field( 150 | default_factory=IndexInputs, 151 | description=("Input configuration for the index. Use `IndexInputs` model to define the inputs."), 152 | ) 153 | outputs: IndexOutputs | None = Field( 154 | default_factory=IndexOutputs, 155 | description=("Optional output configuration for the index. Use `IndexOutputs` model to define the outputs."), 156 | ) 157 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/pipeline_client/pipeline_client.py: -------------------------------------------------------------------------------- 1 | """Pipeline client for importing pipelines and indexes to deepset AI Platform.""" 2 | import asyncio 3 | 4 | import structlog 5 | 6 | from deepset_cloud_sdk._api.config import ( 7 | API_KEY, 8 | API_URL, 9 | DEFAULT_WORKSPACE_NAME, 10 | CommonConfig, 11 | ) 12 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 13 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 14 | IndexConfig, 15 | PipelineConfig, 16 | ) 17 | from deepset_cloud_sdk.workflows.pipeline_client.pipeline_service import ( 18 | PipelineProtocol, 19 | PipelineService, 20 | ) 21 | 22 | logger = structlog.get_logger(__name__) 23 | 24 | 25 | class PipelineClient: 26 | """Client for importing Haystack pipelines and indexes to deepset AI platform. 27 | 28 | This class provides functionality to import Haystack pipelines and indexes into the deepset AI platform. 
29 | 30 | Example for importing a Haystack pipeline or index to deepset AI platform: 31 | ```python 32 | from deepset_cloud_sdk import PipelineClient, PipelineConfig, PipelineInputs, PipelineOutputs, IndexConfig, IndexInputs 33 | from haystack import Pipeline 34 | 35 | # Initialize the client with configuration from environment variables (after running `deepset-cloud login`) 36 | client = PipelineClient() 37 | 38 | # or initialize the client with explicit configuration 39 | client = PipelineClient( 40 | api_key="your-api-key", 41 | workspace_name="your-workspace", 42 | api_url="https://api.deepset.ai" 43 | ) 44 | 45 | # Configure your pipeline 46 | pipeline = Pipeline() 47 | 48 | # Configure import 49 | # if importing a pipeline, use PipelineConfig 50 | config = PipelineConfig( 51 | name="my-pipeline", 52 | inputs=PipelineInputs( 53 | query=["prompt_builder.query"], 54 | filters=["bm25_retriever.filters", "embedding_retriever.filters"], 55 | ), 56 | outputs=PipelineOutputs( 57 | answers="answers_builder.answers", 58 | documents="ranker.documents", 59 | ), 60 | ) 61 | 62 | # if importing an index, use IndexConfig 63 | config = IndexConfig( 64 | name="my-index", 65 | inputs=IndexInputs(files=["file_type_router.sources"]), 66 | ) 67 | 68 | # sync execution 69 | client.import_into_deepset(pipeline, config) 70 | 71 | # async execution 72 | await client.import_into_deepset_async(pipeline, config) 73 | ``` 74 | """ 75 | 76 | def __init__( 77 | self, 78 | api_key: str | None = None, 79 | workspace_name: str | None = None, 80 | api_url: str | None = None, 81 | ) -> None: 82 | """Initialize the Pipeline Client. 83 | 84 | The client can be configured in two ways: 85 | 86 | 1. Using environment variables (recommended): 87 | - Run `deepset-cloud login` to set up the following environment variables: 88 | - `API_KEY`: Your deepset AI platform API key 89 | - `API_URL`: The URL of the deepset AI platform API 90 | - `DEFAULT_WORKSPACE_NAME`: The workspace name to use. 91 | 92 | 2. Using explicit parameters: 93 | - Provide the values directly to this constructor 94 | - Any missing parameters will fall back to environment variables 95 | 96 | :param api_key: Your deepset AI platform API key. Falls back to `API_KEY` environment variable. 97 | :param workspace_name: The workspace to use. Falls back to `DEFAULT_WORKSPACE_NAME` environment variable. 98 | :param api_url: The URL of the deepset AI platform API. Falls back to `API_URL` environment variable. 99 | :raises ValueError: If no api key or workspace name is provided and `API_KEY` or `DEFAULT_WORKSPACE_NAME` is not set in the environment. 100 | """ 101 | self._api_config = CommonConfig( 102 | api_key=api_key or API_KEY, 103 | api_url=api_url or API_URL, 104 | ) 105 | self._workspace_name = workspace_name or DEFAULT_WORKSPACE_NAME 106 | if not self._workspace_name: 107 | raise ValueError( 108 | "Workspace not configured. Provide a workspace name or set the `DEFAULT_WORKSPACE_NAME` environment variable." 109 | ) 110 | 111 | async def import_into_deepset_async(self, pipeline: PipelineProtocol, config: IndexConfig | PipelineConfig) -> None: 112 | """Import a Haystack `Pipeline` or `AsyncPipeline` into deepset AI Platform asynchronously. 113 | 114 | The pipeline must be imported as either an index or a pipeline: 115 | - An index: Processes files and stores them in a document store, making them available for 116 | pipelines to search. 117 | - A pipeline: For other use cases, for example, searching through documents stored by index pipelines. 
118 | 119 | :param pipeline: The Haystack `Pipeline` or `AsyncPipeline` to import. 120 | :param config: Configuration for importing, use either `IndexConfig` or `PipelineConfig`. 121 | If importing an index, the config argument is expected to be of type `IndexConfig`, 122 | if importing a pipeline, the config argument is expected to be of type `PipelineConfig`. 123 | """ 124 | async with DeepsetCloudAPI.factory(self._api_config) as api: 125 | service = PipelineService(api, self._workspace_name) 126 | await service.import_async(pipeline, config) 127 | 128 | def import_into_deepset(self, pipeline: PipelineProtocol, config: IndexConfig | PipelineConfig) -> None: 129 | """Import a Haystack `Pipeline` or `AsyncPipeline` into deepset AI Platform synchronously. 130 | 131 | The pipeline must be imported as either an index or a pipeline: 132 | - An index: Processes files and stores them in a document store, making them available for 133 | pipelines to search. 134 | - A pipeline: For other use cases, for example, searching through documents stored by index pipelines. 135 | 136 | :param pipeline: The Haystack `Pipeline` or `AsyncPipeline` to import. 137 | :param config: Configuration for importing into deepset, use either `IndexConfig` or `PipelineConfig`. 138 | If importing an index, the config argument is expected to be of type `IndexConfig`, 139 | if importing a pipeline, the config argument is expected to be of type `PipelineConfig`. 140 | """ 141 | try: 142 | loop = asyncio.get_event_loop() 143 | # do not close if event loop already exists, e.g. in Jupyter notebooks 144 | should_close = False 145 | except RuntimeError: 146 | loop = asyncio.new_event_loop() 147 | asyncio.set_event_loop(loop) 148 | should_close = True 149 | 150 | try: 151 | return loop.run_until_complete(self.import_into_deepset_async(pipeline, config)) 152 | finally: 153 | if should_close: 154 | loop.close() 155 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/pipeline_client/pipeline_service.py: -------------------------------------------------------------------------------- 1 | """Pipeline importing service for deepset Cloud SDK.""" 2 | # pylint: disable=unnecessary-ellipsis,import-outside-toplevel 3 | from __future__ import annotations 4 | 5 | from http import HTTPStatus 6 | from io import StringIO 7 | from typing import Any, Optional, Protocol, runtime_checkable 8 | 9 | import structlog 10 | from ruamel.yaml import YAML 11 | 12 | from deepset_cloud_sdk._api.config import DEFAULT_WORKSPACE_NAME, CommonConfig 13 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 14 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 15 | IndexConfig, 16 | PipelineConfig, 17 | ) 18 | 19 | logger = structlog.get_logger(__name__) 20 | 21 | 22 | @runtime_checkable 23 | class PipelineProtocol(Protocol): 24 | """Protocol defining the required methods for a Haystack Pipeline or AsyncPipeline.""" 25 | 26 | def dumps(self) -> str: 27 | """Convert the pipeline to a YAML string. 28 | 29 | :return: YAML string representation of the pipeline. 30 | """ 31 | ... 32 | 33 | def add_component(self, name: str, instance: Any) -> None: 34 | """Add a component to the pipeline. 35 | 36 | :param name: Name of the component. 37 | :param instance: Component instance to add. 38 | """ 39 | ... 
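# Note: ``PipelineProtocol`` is ``@runtime_checkable`` and only requires ``dumps()`` and
# ``add_component()``, so a structural ``isinstance`` check passes for any object that
# implements both methods, for example (illustrative, assumes haystack-ai is installed):
#
#     from haystack import Pipeline
#     assert isinstance(Pipeline(), PipelineProtocol)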
40 | 41 | 42 | class PipelineService: 43 | """Handles the importing of Haystack pipelines and indexes into deepset AI platform.""" 44 | 45 | def __init__(self, api: DeepsetCloudAPI, workspace_name: Optional[str] = None) -> None: 46 | """Initialize the pipeline service. 47 | 48 | :param api: An initialized DeepsetCloudAPI instance. 49 | :param workspace_name: Optional workspace name to use instead of environment variable. 50 | """ 51 | self._api = api 52 | self._workspace_name = workspace_name or DEFAULT_WORKSPACE_NAME 53 | self._yaml = YAML() 54 | self._yaml.preserve_quotes = True 55 | self._yaml.indent(mapping=2, sequence=2) 56 | 57 | @classmethod 58 | async def factory(cls, config: CommonConfig, workspace_name: Optional[str] = None) -> PipelineService: 59 | """Create a new instance of the pipeline service. 60 | 61 | :param config: CommonConfig object. 62 | :param workspace_name: Optional workspace name to use instead of environment variable. 63 | """ 64 | async with DeepsetCloudAPI.factory(config) as api: 65 | return cls(api, workspace_name) 66 | 67 | async def import_async(self, pipeline: PipelineProtocol, config: IndexConfig | PipelineConfig) -> None: 68 | """Import a pipeline or an index into deepset AI platform. 69 | 70 | :param pipeline: The pipeline or index to import. Must be a Haystack Pipeline or AsyncPipeline. 71 | :param config: Configuration for importing, either `IndexConfig` or `PipelineConfig`. 72 | If importing an index, the config argument is expected to be of type `IndexConfig`, 73 | if importing a pipeline, the config argument is expected to be of type `PipelineConfig`. 74 | 75 | :raises TypeError: If the pipeline object isn't a Haystack Pipeline or AsyncPipeline. 76 | :raises ValueError: If no workspace is configured. 77 | :raises ImportError: If haystack-ai is not installed. 78 | """ 79 | logger.debug(f"Starting async importing for {config.name}") 80 | 81 | # import locally to avoid Haystack dependency to be installed in the SDK 82 | try: 83 | from haystack import AsyncPipeline as HaystackAsyncPipeline 84 | from haystack import Pipeline as HaystackPipeline 85 | except ImportError as err: 86 | raise ImportError( 87 | "Can't import Pipeline or AsyncPipeline because haystack-ai is not installed. Run 'pip install haystack-ai'." 88 | ) from err 89 | 90 | if not isinstance(pipeline, (HaystackPipeline, HaystackAsyncPipeline)): 91 | raise TypeError( 92 | "Haystack Pipeline or AsyncPipeline object expected. " 93 | "Make sure you have installed haystack-ai and use Pipeline or AsyncPipeline " 94 | "to define your pipeline or index." 95 | ) 96 | 97 | if not self._workspace_name: 98 | raise ValueError( 99 | "The workspace to import into is not configured. " 100 | "Run 'deepset-cloud login' and follow the instructions or configure the workspace name on the SDK instance." 101 | ) 102 | 103 | if isinstance(config, IndexConfig): 104 | logger.debug(f"Importing index into workspace {self._workspace_name}") 105 | await self._import_index(pipeline, config) 106 | else: 107 | logger.debug(f"Importing pipeline into workspace {self._workspace_name}") 108 | await self._import_pipeline(pipeline, config) 109 | 110 | async def _import_index(self, pipeline: PipelineProtocol, config: IndexConfig) -> None: 111 | """Import an index into deepset AI Platform. 112 | 113 | :param pipeline: The Haystack pipeline to import. 114 | :param config: Configuration for importing an index. 
115 | """ 116 | pipeline_yaml = self._from_haystack_pipeline(pipeline, config) 117 | response = await self._api.post( 118 | workspace_name=self._workspace_name, 119 | endpoint="indexes", 120 | json={"name": config.name, "config_yaml": pipeline_yaml}, 121 | ) 122 | response.raise_for_status() 123 | if response.status_code == HTTPStatus.NO_CONTENT: 124 | logger.debug(f"Index {config.name} successfully created.") 125 | 126 | async def _import_pipeline(self, pipeline: PipelineProtocol, config: PipelineConfig) -> None: 127 | """Import a pipeline into deepset AI Platform. 128 | 129 | :param pipeline: The Haystack pipeline to import. 130 | :param config: Configuration for importing a pipeline. 131 | """ 132 | logger.debug(f"Importing pipeline {config.name}") 133 | pipeline_yaml = self._from_haystack_pipeline(pipeline, config) 134 | response = await self._api.post( 135 | workspace_name=self._workspace_name, 136 | endpoint="pipelines", 137 | json={"name": config.name, "query_yaml": pipeline_yaml}, 138 | ) 139 | response.raise_for_status() 140 | if response.status_code == HTTPStatus.NO_CONTENT: 141 | logger.debug(f"Pipeline {config.name} successfully created.") 142 | 143 | def _from_haystack_pipeline(self, pipeline: PipelineProtocol, config: IndexConfig | PipelineConfig) -> str: 144 | """Create a YAML configuration from the pipeline. 145 | 146 | :param pipeline: The Haystack pipeline to create the configuration for. 147 | :param config: Configuration for importing. 148 | :return: YAML configuration as a string. 149 | """ 150 | # Parse the pipeline YAML 151 | pipeline_dict = self._yaml.load(pipeline.dumps()) 152 | self._add_inputs_and_outputs(pipeline_dict, config) 153 | self._add_async_flag_if_needed(pipeline, pipeline_dict) 154 | 155 | # Convert back to string 156 | yaml_str = StringIO() 157 | self._yaml.dump(pipeline_dict, yaml_str) 158 | return yaml_str.getvalue() 159 | 160 | def _add_inputs_and_outputs(self, pipeline_dict: dict, config: IndexConfig | PipelineConfig) -> None: 161 | """Add inputs and outputs to the pipeline dictionary from config. 162 | 163 | :param pipeline_dict: The pipeline dictionary to add inputs and outputs to. 164 | :param config: Configuration for importing. 165 | """ 166 | if config.inputs and (converted_inputs := config.inputs.to_yaml_dict()): 167 | pipeline_dict["inputs"] = converted_inputs 168 | if config.outputs and (converted_outputs := config.outputs.to_yaml_dict()): 169 | pipeline_dict["outputs"] = converted_outputs 170 | 171 | def _add_async_flag_if_needed(self, pipeline: PipelineProtocol, pipeline_dict: dict) -> None: 172 | """Add async_enabled flag to pipeline dict if pipeline is AsyncPipeline. 173 | 174 | This enables running pipelines asynchronously in deepset. 175 | 176 | :param pipeline: The Haystack pipeline to check. 177 | :param pipeline_dict: The pipeline dictionary to modify. 
178 | """ 179 | try: 180 | from haystack import AsyncPipeline as HaystackAsyncPipeline 181 | 182 | if isinstance(pipeline, HaystackAsyncPipeline): 183 | pipeline_dict["async_enabled"] = True 184 | except ImportError: 185 | # If haystack-ai is not available, we can't check the type 186 | # This should not happen since we already checked in import_async 187 | pass 188 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/sync_client/__init__.py: -------------------------------------------------------------------------------- 1 | """Sync implementation of workflows client.""" 2 | -------------------------------------------------------------------------------- /deepset_cloud_sdk/workflows/sync_client/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for making async code sync.""" 2 | from asyncio import AbstractEventLoop 3 | from typing import AsyncIterator, Generator, Optional, Tuple, TypeVar 4 | 5 | T = TypeVar("T") 6 | 7 | 8 | def iter_over_async(ait: AsyncIterator[T], loop: AbstractEventLoop) -> Generator[T, None, None]: 9 | """Convert an async generator to a sync generator. 10 | 11 | :param ait: Async generator to convert. 12 | :param loop: Event loop to run the async generator on. 13 | :return: Sync generator. 14 | """ 15 | # Taken from 16 | # https://stackoverflow.com/questions/63587660/yielding-asyncio-generator-data-back-from-event-loop-possible/63595496#63595496 17 | ait = ait.__aiter__() # pylint: disable=unnecessary-dunder-call 18 | 19 | async def get_next() -> Tuple[bool, Optional[T]]: 20 | try: 21 | obj = await ait.__anext__() # pylint: disable=unnecessary-dunder-call 22 | return False, obj 23 | except StopAsyncIteration: 24 | return True, None 25 | 26 | while True: 27 | done, obj = loop.run_until_complete(get_next()) 28 | if done: 29 | break 30 | # object will always be not `None` 31 | yield obj # type: ignore 32 | -------------------------------------------------------------------------------- /docs/_images/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/_pydoc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/docs/_pydoc/__init__.py -------------------------------------------------------------------------------- /docs/_pydoc/config/async_client.yml: -------------------------------------------------------------------------------- 1 | loaders: 2 | - type: python 3 | search_path: [../../../deepset_cloud_sdk/workflows/async_client] 4 | modules: ["files"] 5 | ignore_when_discovered: ["__init__"] 6 | processors: 7 | - type: filter 8 | expression: 9 | documented_only: true 10 | do_not_filter_modules: false 11 | skip_empty_modules: true 12 | - type: smart 13 | - type: crossref 14 | renderer: 15 | type: renderers.ReadmeRenderer 16 | excerpt: An asynchronous client for the deepset Cloud API. 
17 | category_slug: sdk-10 18 | title: Asynchronous Client 19 | slug: async_client 20 | order: 0 21 | markdown: 22 | descriptive_class_title: false 23 | descriptive_module_title: true 24 | add_method_class_prefix: true 25 | add_member_class_prefix: false 26 | filename: async_client.md 27 | -------------------------------------------------------------------------------- /docs/_pydoc/config/cli.yml: -------------------------------------------------------------------------------- 1 | loaders: 2 | - type: python 3 | search_path: [../../../deepset_cloud_sdk] 4 | modules: ["cli"] 5 | ignore_when_discovered: ["__init__"] 6 | processors: 7 | - type: filter 8 | expression: 9 | documented_only: true 10 | do_not_filter_modules: false 11 | skip_empty_modules: true 12 | - type: smart 13 | - type: crossref 14 | renderer: 15 | type: renderers.ReadmeRenderer 16 | excerpt: A cli tool for the deepset Cloud API. 17 | category_slug: sdk-10 18 | title: deepset Cloud CLI 19 | slug: cli 20 | order: 0 21 | markdown: 22 | descriptive_class_title: false 23 | descriptive_module_title: true 24 | add_method_class_prefix: true 25 | add_member_class_prefix: false 26 | filename: cli.md 27 | -------------------------------------------------------------------------------- /docs/_pydoc/config/sync_client.yml: -------------------------------------------------------------------------------- 1 | loaders: 2 | - type: python 3 | search_path: [../../../deepset_cloud_sdk/workflows/sync_client] 4 | modules: ["files"] 5 | ignore_when_discovered: ["__init__"] 6 | processors: 7 | - type: filter 8 | expression: 9 | documented_only: true 10 | do_not_filter_modules: false 11 | skip_empty_modules: true 12 | - type: smart 13 | - type: crossref 14 | renderer: 15 | type: renderers.ReadmeRenderer 16 | excerpt: A synchronous client for the deepset Cloud API. 17 | category_slug: sdk-10 18 | title: Synchronous Client 19 | slug: sync_client 20 | order: 0 21 | markdown: 22 | descriptive_class_title: false 23 | descriptive_module_title: true 24 | add_method_class_prefix: true 25 | add_member_class_prefix: false 26 | filename: sync_client.md 27 | -------------------------------------------------------------------------------- /docs/_pydoc/renderers.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import dataclasses 3 | import io 4 | import os 5 | import sys 6 | import typing as t 7 | 8 | import docspec 9 | from pydoc_markdown.contrib.renderers.markdown import MarkdownRenderer 10 | from pydoc_markdown.interfaces import Context, Renderer 11 | 12 | README_FRONTMATTER = """--- 13 | title: {title} 14 | excerpt: {excerpt} 15 | slug: {slug} 16 | order: {order} 17 | hidden: false 18 | --- 19 | 20 | """ 21 | 22 | 23 | @dataclasses.dataclass 24 | class ReadmeRenderer(Renderer): 25 | """ 26 | This custom Renderer is heavily based on the `MarkdownRenderer`, 27 | it just prepends a front matter so that the output can be published 28 | directly to readme.io. 
29 | """ 30 | 31 | # These settings will be used in the front matter output 32 | title: str 33 | category_slug: str 34 | excerpt: str 35 | slug: str 36 | order: int 37 | # Docs categories fetched from Readme.io 38 | categories: t.Dict[str, str] = dataclasses.field(init=False) 39 | # This exposes a special `markdown` settings value that can be used to pass 40 | # parameters to the underlying `MarkdownRenderer` 41 | markdown: MarkdownRenderer = dataclasses.field(default_factory=MarkdownRenderer) 42 | 43 | def init(self, context: Context) -> None: 44 | self.markdown.init(context) 45 | version = self._doc_version() 46 | 47 | def _doc_version(self) -> str: 48 | """ 49 | Returns the docs version. 50 | """ 51 | # full_version = about.__version__ 52 | # major, minor = full_version.split(".")[:2] 53 | # return f"v{major}.{minor}" 54 | 55 | # The readme.io version is hardcoded for now to manually maintain the guides 56 | # within the same documentation page as the deepset Cloud Docs. 57 | return "v1.0" 58 | 59 | def render(self, modules: t.List[docspec.Module]) -> None: 60 | if self.markdown.filename is None: 61 | sys.stdout.write(self._frontmatter()) 62 | self.markdown.render_single_page(sys.stdout, modules) 63 | else: 64 | with io.open(self.markdown.filename, "w", encoding=self.markdown.encoding) as fp: 65 | fp.write(self._frontmatter()) 66 | self.markdown.render_single_page(t.cast(t.TextIO, fp), modules) 67 | 68 | def _frontmatter(self) -> str: 69 | return README_FRONTMATTER.format( 70 | title=self.title, 71 | excerpt=self.excerpt, 72 | slug=self.slug, 73 | order=self.order, 74 | ) 75 | -------------------------------------------------------------------------------- /docs/_pydoc/requirements.txt: -------------------------------------------------------------------------------- 1 | pydoc-markdown==4.8.2 2 | PyYAML==6.0.1 3 | # pin docspec while waiting for https://github.com/NiklasRosenstein/docspec/issues/91 to be fixed 4 | docspec-python==2.2.1 5 | requests==2.31.0 6 | -------------------------------------------------------------------------------- /docs/_stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #2b2f55; 3 | --md-accent-fg-color: #1890ff; 4 | /* accent is used for over*/ 5 | } 6 | 7 | /* Hide the title deepset Cloud SDK, but still show the name in the tab */ 8 | .md-nav__title { 9 | display: none; 10 | } -------------------------------------------------------------------------------- /docs/examples/cli/README.md: -------------------------------------------------------------------------------- 1 | # deepset Cloud CLI 2 | The deepset Cloud CLI is a command-line interface tool that you can use to interact with the deepset Cloud SDK and perform various operations, such as uploading files and folders to your deepset Cloud workspace. 3 | 4 | ## Installation 5 | To install the deepset Cloud CLI, use `pip`: 6 | 7 | ```shell 8 | pip install deepset-cloud-sdk 9 | ``` 10 | ## Configuration 11 | Before using the deepset Cloud CLI, log in and provide your credentials. You can do this by running the command: 12 | 13 | On MacOS and Linux: 14 | 15 | ```shell 16 | deepset-cloud login 17 | ``` 18 | On Windows: 19 | 20 | ```shell 21 | python -m deepset_cloud_sdk.cli login 22 | ``` 23 | 24 | This command prompts you to enter your API key and default workspace name. Once you provide these details, the CLI stores your credentials in the `~/.deepset-cloud/.env` file. 
This file is used as the default configuration for subsequent CLI commands. 25 | 26 | Alternatively, to use a different environment file for your configuration, you can create an `.env` file in the local directory. Additionally, you have the flexibility to provide the credentials directly as command-line arguments or set them programmatically in your code. 27 | 28 | ## Usage 29 | You can use the deepset Cloud CLI by running the following command: 30 | 31 | On MacOS and Linux: 32 | 33 | ```shell 34 | deepset-cloud 35 | ``` 36 | 37 | On Windows: 38 | 39 | ```shell 40 | python -m deepset_cloud_sdk.cli 41 | ``` 42 | 43 | Replace with one of the supported commands. To list all available commands, use the `--help` flag. 44 | 45 | ## Example Commands 46 | 47 | ### Upload Files and Folders 48 | 49 | You don't have to follow any special folder structure. If there are multiple files with the same name in your folder, they're all uploaded by default. You can change this behavior with the `--write-mode` flag. See the examples below. 50 | 51 | This command uploads the file example.txt to your deepset Cloud workspace. 52 | On MacOS and Linux: 53 | 54 | ```shell 55 | deepset-cloud upload ./examples/data/example.txt 56 | ``` 57 | 58 | On Windows: 59 | 60 | ```shell 61 | python -m deepset_cloud_sdk.cli upload ./examples/data/example.txt 62 | ``` 63 | 64 | This command uploads all `.txt` and `.pdf` files from the folder located in the _examples_ directory to your deepset Cloud workspace. By default only `.txt` and `.pdf` files are uploaded. To upload different file types see below. 65 | 66 | The paths in the examples are relative to the current working directory. 67 | 68 | On MacOS and Linux: 69 | 70 | ```shell 71 | deepset-cloud upload ./examples/data 72 | ``` 73 | On Windows: 74 | ```shell 75 | python -m deepset_cloud_sdk.cli upload ./examples/data 76 | ``` 77 | To overwrite existing files in your project, use the `--write-mode` flag. For example: 78 | 79 | On MacOS and Linux: 80 | ```shell 81 | deepset-cloud upload ./examples/data --write-mode OVERWRITE 82 | ``` 83 | On Windows: 84 | ```shell 85 | python -m deepset_cloud_sdk.cli upload ./examples/data --write-mode OVERWRITE 86 | ``` 87 | This syncs your local files with the files in your deepset Cloud workspace without having to manually delete the files in your workspace. 88 | 89 | ## Upload different file types 90 | 91 | To upload other file types than text, specify the desired file types using the flag `--use-type`. 92 | The command below uploads all file types from the ./example/data directory that are supported by deepset Cloud. 93 | 94 | ```shell 95 | deepset-cloud upload ./examples/data --use-type .csv --use-type .docx --use-type .html --use-type .json --use-type .md --use-type .txt --use-type .pdf --use-type .pptx --use-type .xlsx --use-type .xml 96 | 97 | ``` 98 | On Windows: 99 | ```shell 100 | python -m deepset_cloud_sdk.cli upload ./examples/data --use-type .csv --use-type .docx --use-type .html --use-type .json --use-type .md --use-type .txt --use-type .pdf --use-type .pptx --use-type .xlsx --use-type .xml 101 | ``` 102 | 103 | 104 | ### Downloading Files from deepset Cloud 105 | This command downloads all files from a workspace to a local directory. 
For example: 106 | 107 | On MacOS and Linux: 108 | 109 | ```shell 110 | deepset-cloud download --workspace-name 111 | ``` 112 | On Windows: 113 | ```shell 114 | python -m deepset_cloud_sdk.cli download --workspace-name 115 | ``` 116 | 117 | To filter for specific files, use the same filters as for listing files. 118 | 119 | 120 | ### List Files 121 | You can run the `list-files` operation to search files in your deepset Cloud workspace. For example: 122 | 123 | On MacOS and Linux: 124 | ```shell 125 | deepset-cloud list-files 126 | ``` 127 | On Windows: 128 | ```shell 129 | python -m deepset_cloud_sdk.cli list-files 130 | ``` 131 | with optional arguments: 132 | 133 | ```shell 134 | --name "" # search by file name 135 | --content "content" # search by file content 136 | --odata-filter "key eq 'value'" # search by odata filter 137 | ``` 138 | 139 | ### Support 140 | If you encounter issues or have questions, reach out to our team on [Discord](https://discord.com/invite/qZxjM4bAHU). 141 | 142 | We hope you find the deepset Cloud CLI useful in your projects. Happy coding! 143 | -------------------------------------------------------------------------------- /docs/examples/data/example.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.3 2 | %���� 3 | 4 | 1 0 obj 5 | << 6 | /Type /Catalog 7 | /Outlines 2 0 R 8 | /Pages 3 0 R 9 | >> 10 | endobj 11 | 12 | 2 0 obj 13 | << 14 | /Type /Outlines 15 | /Count 0 16 | >> 17 | endobj 18 | 19 | 3 0 obj 20 | << 21 | /Type /Pages 22 | /Count 2 23 | /Kids [ 4 0 R 6 0 R ] 24 | >> 25 | endobj 26 | 27 | 4 0 obj 28 | << 29 | /Type /Page 30 | /Parent 3 0 R 31 | /Resources << 32 | /Font << 33 | /F1 9 0 R 34 | >> 35 | /ProcSet 8 0 R 36 | >> 37 | /MediaBox [0 0 612.0000 792.0000] 38 | /Contents 5 0 R 39 | >> 40 | endobj 41 | 42 | 5 0 obj 43 | << /Length 1074 >> 44 | stream 45 | 2 J 46 | BT 47 | 0 0 0 rg 48 | /F1 0027 Tf 49 | 57.3750 722.2800 Td 50 | ( A Simple PDF File ) Tj 51 | ET 52 | BT 53 | /F1 0010 Tf 54 | 69.2500 688.6080 Td 55 | ( This is a small demonstration .pdf file - ) Tj 56 | ET 57 | BT 58 | /F1 0010 Tf 59 | 69.2500 664.7040 Td 60 | ( just for use in the Virtual Mechanics tutorials. More text. And more ) Tj 61 | ET 62 | BT 63 | /F1 0010 Tf 64 | 69.2500 652.7520 Td 65 | ( text. And more text. And more text. And more text. ) Tj 66 | ET 67 | BT 68 | /F1 0010 Tf 69 | 69.2500 628.8480 Td 70 | ( And more text. And more text. And more text. And more text. And more ) Tj 71 | ET 72 | BT 73 | /F1 0010 Tf 74 | 69.2500 616.8960 Td 75 | ( text. And more text. Boring, zzzzz. And more text. And more text. And ) Tj 76 | ET 77 | BT 78 | /F1 0010 Tf 79 | 69.2500 604.9440 Td 80 | ( more text. And more text. And more text. And more text. And more text. ) Tj 81 | ET 82 | BT 83 | /F1 0010 Tf 84 | 69.2500 592.9920 Td 85 | ( And more text. And more text. ) Tj 86 | ET 87 | BT 88 | /F1 0010 Tf 89 | 69.2500 569.0880 Td 90 | ( And more text. And more text. And more text. And more text. And more ) Tj 91 | ET 92 | BT 93 | /F1 0010 Tf 94 | 69.2500 557.1360 Td 95 | ( text. And more text. And more text. Even more. Continued on page 2 ...) 
Tj 96 | ET 97 | endstream 98 | endobj 99 | 100 | 6 0 obj 101 | << 102 | /Type /Page 103 | /Parent 3 0 R 104 | /Resources << 105 | /Font << 106 | /F1 9 0 R 107 | >> 108 | /ProcSet 8 0 R 109 | >> 110 | /MediaBox [0 0 612.0000 792.0000] 111 | /Contents 7 0 R 112 | >> 113 | endobj 114 | 115 | 7 0 obj 116 | << /Length 676 >> 117 | stream 118 | 2 J 119 | BT 120 | 0 0 0 rg 121 | /F1 0027 Tf 122 | 57.3750 722.2800 Td 123 | ( Simple PDF File 2 ) Tj 124 | ET 125 | BT 126 | /F1 0010 Tf 127 | 69.2500 688.6080 Td 128 | ( ...continued from page 1. Yet more text. And more text. And more text. ) Tj 129 | ET 130 | BT 131 | /F1 0010 Tf 132 | 69.2500 676.6560 Td 133 | ( And more text. And more text. And more text. And more text. And more ) Tj 134 | ET 135 | BT 136 | /F1 0010 Tf 137 | 69.2500 664.7040 Td 138 | ( text. Oh, how boring typing this stuff. But not as boring as watching ) Tj 139 | ET 140 | BT 141 | /F1 0010 Tf 142 | 69.2500 652.7520 Td 143 | ( paint dry. And more text. And more text. And more text. And more text. ) Tj 144 | ET 145 | BT 146 | /F1 0010 Tf 147 | 69.2500 640.8000 Td 148 | ( Boring. More, a little more text. The end, and just as well. ) Tj 149 | ET 150 | endstream 151 | endobj 152 | 153 | 8 0 obj 154 | [/PDF /Text] 155 | endobj 156 | 157 | 9 0 obj 158 | << 159 | /Type /Font 160 | /Subtype /Type1 161 | /Name /F1 162 | /BaseFont /Helvetica 163 | /Encoding /WinAnsiEncoding 164 | >> 165 | endobj 166 | 167 | 10 0 obj 168 | << 169 | /Creator (Rave \(http://www.nevrona.com/rave\)) 170 | /Producer (Nevrona Designs) 171 | /CreationDate (D:20060301072826) 172 | >> 173 | endobj 174 | 175 | xref 176 | 0 11 177 | 0000000000 65535 f 178 | 0000000019 00000 n 179 | 0000000093 00000 n 180 | 0000000147 00000 n 181 | 0000000222 00000 n 182 | 0000000390 00000 n 183 | 0000001522 00000 n 184 | 0000001690 00000 n 185 | 0000002423 00000 n 186 | 0000002456 00000 n 187 | 0000002574 00000 n 188 | 189 | trailer 190 | << 191 | /Size 11 192 | /Root 1 0 R 193 | /Info 10 0 R 194 | >> 195 | 196 | startxref 197 | 2714 198 | %%EOF 199 | -------------------------------------------------------------------------------- /docs/examples/data/example.txt: -------------------------------------------------------------------------------- 1 | This is text 2 | -------------------------------------------------------------------------------- /docs/examples/data/example.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value" 3 | } 4 | -------------------------------------------------------------------------------- /docs/examples/sdk/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Upload files to deepset Cloud 4 | 5 | You can upload files in three different ways: 6 | 1. Upload multiple files by providing explicit file paths. 7 | 2. Upload all files from a folder. 8 | 3. Upload raw text. 9 | 10 | For uploading files from your local machine to deepset Cloud, you can use `upload`. 11 | 12 | ## Authentication 13 | 14 | You will need to either explicitly pass an api_key to the `upload` function or set the environment variable 15 | `DEEPSET_CLOUD_API_KEY` to your api key. 16 | By running `deepset-cloud login` you can also store your api key globally on your machine. 17 | This will allow you to omit the api_key parameter in the following examples. 18 | 19 | ## Example 1: Upload all files from a folder 20 | Uploads all files from a folder to the default workspace. 
21 | 22 | ```python 23 | upload( 24 | # workspace_name="my_workspace", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used 25 | paths=[Path("./examples/data")], 26 | blocking=True, # optional, by default True 27 | timeout_s=300, # optional, by default 300 28 | show_progress=True, # optional, by default True 29 | recursive=False, # optional, by default False 30 | ) 31 | ``` 32 | 33 | ## Example 2: Upload raw texts 34 | 35 | Uploads a list of raw texts to the default workspace. 36 | This can be useful if you want to process your text first and later upload the content of the files. 37 | 38 | ```python 39 | upload_texts( 40 | # workspace_name="my_workspace", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used 41 | files=[ 42 | DeepsetCloudFile( 43 | name="example.txt", 44 | text="this is text", 45 | meta={"key": "value"}, # optional 46 | ) 47 | ], 48 | blocking=True, # optional, by default True 49 | timeout_s=300, # optional, by default 300 50 | ) 51 | ``` 52 | ## Colab Notebook 53 | 54 | We created this Colab notebook with different upload scenarios that you can test out: [Upload files with SDK in Collab](https://colab.research.google.com/drive/1y2KMB606h-57BafCkhuiaXFWo4gDKtG3?authuser=1#scrollTo=QpIbW_nNA_fT). -------------------------------------------------------------------------------- /docs/examples/sdk/upload.py: -------------------------------------------------------------------------------- 1 | ## Authentication 2 | ## -------------- 3 | ## Either explicitly pass an api_key to the `upload` function or set the environment variable 4 | ## `DEEPSET_CLOUD_API_KEY` to your API key. 5 | ## By running `deepset-cloud login` you can also store your API key globally on your machine. 6 | ## This omits the `api_key`` parameter in the following examples. 7 | 8 | ## Example 1: Upload all files from a folder 9 | ## ----------------------------------------- 10 | ## Uploads all files from a folder to the default workspace. 11 | 12 | from pathlib import Path 13 | 14 | from deepset_cloud_sdk.workflows.sync_client.files import upload 15 | 16 | upload( 17 | # workspace_name="my_workspace", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used 18 | paths=[Path("./examples/data")], 19 | blocking=True, # optional, by default True 20 | timeout_s=300, # optional, by default 300 21 | show_progress=True, # optional, by default True 22 | recursive=False, # optional, by default False 23 | ) 24 | 25 | 26 | ## Example 2: Upload raw texts 27 | ## --------------------------- 28 | ## Uploads a list of raw texts to the default workspace. 29 | ## This is useful if you want to process your text first and upload the content of the files later. 30 | 31 | from deepset_cloud_sdk.workflows.sync_client.files import upload_texts 32 | 33 | upload_texts( 34 | # workspace_name="my_workspace", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used 35 | files=[ 36 | DeepsetCloudFile( 37 | name="example.txt", 38 | text="this is text", 39 | meta={"key": "value"}, # optional 40 | ) 41 | ], 42 | blocking=True, # optional, by default True 43 | timeout_s=300, # optional, by default 300 44 | ) 45 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 |

2 | deepset Cloud SDK 3 |

4 | 5 | [![Coverage badge](https://github.com/deepset-ai/deepset-cloud-sdk/raw/python-coverage-comment-action-data/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/tree/python-coverage-comment-action-data) 6 | [![Tests](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/continuous-integration.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/continuous-integration.yml) 7 | [![Deploy PyPi](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/deploy-prod.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/deploy-prod.yml) 8 | [![Compliance Checks](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/compliance.yml/badge.svg)](https://github.com/deepset-ai/deepset-cloud-sdk/actions/workflows/compliance.yml) 9 | 10 | The deepset Cloud SDK is an open source software development kit that provides convenient access and integration with deepset Cloud, a powerful cloud offering for various natural language processing (NLP) tasks. To learn more about deepset Cloud, please have a look at the [official Documentation](https://docs.cloud.deepset.ai/). 11 | 12 | # Supported Features 13 | The following examples demonstrate how to use the deepset Cloud SDK to interact with deepset Cloud using Python. 14 | You can use the deepset Cloud SDK in the command line as well. For more information, see the [CLI documentation](/deepset-cloud-sdk/examples/cli). 15 | - [SDK Examples - Upload datasets](/deepset-cloud-sdk/examples/sdk) 16 | - [CLI Examples - Upload datasets](/deepset-cloud-sdk/examples/cli/) 17 | 18 | ## Installation 19 | The deepset Cloud SDK is available on PyPI and you can install it using pip: 20 | ```bash 21 | pip install deepset-cloud-sdk 22 | ``` 23 | 24 | After installing the deepset Cloud SDK, you can use it to interact with deepset Cloud. It comes with a command line interface (CLI), that you can use by calling: 25 | ```bash 26 | deepset-cloud --help 27 | ``` 28 | 29 |

30 | deepset Cloud CLI 31 |

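A quick way to try the CLI end to end is to log in once and then upload and list files against your default workspace. This is only a sketch of a typical first session; the `./data` folder is a placeholder for your own files, and by default only `.txt` and `.pdf` files are picked up:

```bash
deepset-cloud login          # prompts for your API key and a default workspace name
deepset-cloud upload ./data  # uploads .txt and .pdf files from the ./data folder
deepset-cloud list-files     # lists the files in your workspace to verify the upload
```

On Windows, run the same commands through `python -m deepset_cloud_sdk.cli` instead of `deepset-cloud`.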
32 | 33 | ### Development Installation 34 | To install the deepset Cloud SDK for development, clone the repository and install the package in editable mode: 35 | ```bash 36 | pip install hatch==1.7.0 37 | hatch build 38 | ``` 39 | 40 | Instead of calling the cli from the build package, you can call it directly from the source code: 41 | ```bash 42 | python3 -m deepset_cloud_sdk.cli --help 43 | ``` 44 | 45 | --- 46 | ## Interested in deepset Cloud? 47 | If you are interested in exploring deepset Cloud, visit cloud.deepset.ai. 48 | deepset Cloud provides a range of NLP capabilities and services to help you build and deploy powerful 49 | natural language processing applications. 50 | 51 | ## Interested in Haystack? 52 | deepset Cloud is powered by Haystack, an open source framework for building end-to-end NLP pipelines. 53 | - [Project website](https://haystack.deepset.ai/) 54 | - [GitHub repository](https://github.com/deepset-ai/haystack) 55 | -------------------------------------------------------------------------------- /docs/upload_files.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Uploading with SDK is the fastest way if you have many files. It uses sessions under the hood. That means, you create a session and then upload files to this session. Each session has an ID and you can check its status. The upload starts when you close a session. If you leave a session open, it expires after 24 hours. 4 | 5 | After your files are uploaded, it can take a while for them to be listed in deepset Cloud. This means that if you deployed a pipeline, you may need to wait a while for it to run on the newly uploaded files. 6 | 7 | You can use the CLI or the SDK Python methods to upload your files. 8 | 9 | ## Folder Structure 10 | 11 | You don't need to follow any specific folder structure. If your folder contains files with the same name, all these files are uploaded, by default. You can set the `--write-mode` to overwrite the files, keep them all, or fail the upload. For more information, see [CLI examples](/examples/cli/README.md) and [SDK examples](/examples/sdk/README.md). 12 | 13 | # Upload Files 14 | 15 | ## Upload text files: 16 | 17 | By default it is allowed to upload .txt and .pdf files. See below to upload different file types. 18 | 19 | 1. Log in to the sdk: `deepset-cloud login` (MacOS and Linux) or `python -m deepset_cloud_sdk.cli login` (Windows). 20 | 2. When prompted, paste your deepset Cloud API key. 21 | 3. Type the name of the deepset Cloud workspace you want to set as default for all operations. 22 | 4. Choose if you want to use the CLI or a Python script to upload: 23 | - To upload files from a folder using CLI, run: `deepset-cloud upload ` (MacOS and Linux) or `python -m deepset_cloud_sdk.cli upload ` (On Windows) 24 | - To upload files from a folder using a Python script, create the script and run it. 
Here's an example you can use: 25 | 26 | ```python 27 | from pathlib import Path 28 | from deepset_cloud_sdk.workflows.sync_client.files import upload 29 | 30 | ## Uploads all txt and pdf files from a given path 31 | upload( 32 | paths=[Path("")], 33 | blocking=True, # waits until the files are displayed in deepset Cloud, 34 | # this may take a couple of minutes 35 | timeout_s=300, # the timeout for the `blocking` parameter in number of seconds 36 | show_progress=True, # shows the progress bar 37 | recursive=True, # uploads text files from all subfolders as well 38 | ) 39 | ``` 40 | 41 | ## Upload other file types 42 | 43 | Deepset Cloud currently supports uploading : .csv, .docx, .html, .json, .md, .txt, .pdf, .pptx, .xlsx and .xml. 44 | 45 | 46 | ```python 47 | from pathlib import Path 48 | from deepset_cloud_sdk.workflows.sync_client.files import upload 49 | 50 | ## Uploads supported files from a given path 51 | upload( 52 | paths=[Path("")], 53 | blocking=True, 54 | timeout_s=300, 55 | show_progress=True, 56 | recursive=True, 57 | desired_file_types=[ # list of desired file types to upload 58 | ".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml" 59 | ] 60 | ) 61 | ``` 62 | 63 | For more examples, see [CLI examples](/examples/cli/README.md) and [SDK examples](/examples/sdk/README.md). 64 | 65 | # Metadata 66 | 67 | To add metadata to your files, create one metadata file for each file you upload. The metadata file must be a JSON with the same name as the file whose metadata it contains and the extension `meta.json`. 68 | 69 | For example, if you're uploading a file called `example.txt`, the metadata file should be called `example.txt.meta.json`. If you're uploading a file called `example.pdf`, the metadata file should be `example.pdf.meta.json`. 70 | 71 | The format your metadata in your metadata files should follow is: `{"meta_key1": "value1", "meta_key2": "value2"}`. See the [example metadata file](/examples/data/example.txt.meta.json). 
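If you have many files, you may want to generate the metadata files programmatically instead of writing them by hand. The snippet below is only a minimal sketch of the naming convention described above; the folder path and the metadata keys (`source`, `category`) are placeholders for your own values:

```python
import json
from pathlib import Path

data_dir = Path("<path_to_folder>")  # placeholder: the folder you plan to upload

# Create one <file_name>.meta.json next to every .txt file in the folder,
# e.g. example.txt -> example.txt.meta.json
for file_path in data_dir.glob("*.txt"):
    meta = {"source": "local-upload", "category": "docs"}  # placeholder metadata
    meta_path = file_path.with_name(file_path.name + ".meta.json")
    meta_path.write_text(json.dumps(meta), encoding="utf-8")
```

Once the metadata files sit next to the files they describe, upload the folder as shown in the examples above and the metadata is picked up along with the files.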
72 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: deepset Cloud SDK 2 | site_description: deepset Cloud SDK Documentation 3 | site_url: https://deepset-ai.github.io/deepset-cloud-sdk/ 4 | 5 | # Repository 6 | repo_name: deepset-ai/deepset-cloud-sdk 7 | repo_url: https://github.com/deepset-ai/deepset-cloud-sdk 8 | edit_uri: "" 9 | 10 | theme: 11 | name: material 12 | favicon: _images/favicon.svg 13 | logo: _images/white-logo.svg 14 | features: 15 | - content.code.copy 16 | palette: 17 | primary: custom 18 | 19 | plugins: 20 | - search 21 | - mermaid2 22 | - mkdocstrings 23 | 24 | markdown_extensions: 25 | - pymdownx.highlight: 26 | anchor_linenums: true 27 | line_spans: __span 28 | pygments_lang_class: true 29 | - pymdownx.inlinehilite 30 | - pymdownx.snippets 31 | - pymdownx.superfences: 32 | preserve_tabs: true 33 | custom_fences: 34 | - name: mermaid 35 | class: mermaid 36 | format: !!python/name:pymdownx.superfences.fence_code_format 37 | 38 | extra: 39 | version: 40 | provider: mike 41 | 42 | extra_javascript: 43 | - optionalConfig.js 44 | - https://unpkg.com/mermaid@9.4.0/dist/mermaid.min.js 45 | - extra-loader.js 46 | 47 | extra_css: 48 | - _stylesheets/extra.css 49 | 50 | # mkdocs uses the `docs` folder as root folder 51 | nav: 52 | - Get Started: index.md 53 | - Upload Files: upload_files.md 54 | - Examples: 55 | - CLI: examples/cli/README.md 56 | - SDK: examples/sdk/README.md 57 | - API Docs: # autogenerated within the /docs/_pydoc folder 58 | - Synchronous SDK: _pydoc/temp/sync_client.md 59 | - Asynchronous SDK: _pydoc/temp/async_client.md 60 | - CLI: _pydoc/temp/cli.md 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "deepset-cloud-sdk" 7 | dynamic = ["version"] 8 | description = 'deepset Cloud SDK' 9 | readme = "README.md" 10 | requires-python = ">= 3.8" 11 | license = "Apache-2.0" 12 | keywords = [] 13 | authors = [{ name = "deepset", email = "rohan.janjua@deepset.ai" }] 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Programming Language :: Python", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: Implementation :: CPython", 22 | "Programming Language :: Python :: Implementation :: PyPy", 23 | ] 24 | dependencies = [ 25 | "structlog>=24.0.0", 26 | "httpx>=0.27.2", 27 | "python-dotenv>=1.0.1", 28 | "typer>=0.16.0", 29 | "click==8.2.0", # fixed because of bug in 8.2.1, see https://github.com/pallets/click/issues/2939 30 | "tenacity>=8.3.0", 31 | "aiohttp>=3.10.10", 32 | "aiofiles>=24.1.0", 33 | "tabulate>=0.9.0", 34 | "tqdm>=4.66.4", 35 | "yaspin>=3.0.0", 36 | "pyrate-limiter>=3.7.0", 37 | "pydantic>=2.11.4", 38 | "ruamel.yaml>=0.18.10", 39 | ] 40 | 41 | [project.urls] 42 | Documentation = "https://github.com/deepset-ai/deepset-cloud-sdk#readme" 43 | Issues = "https://github.com/deepset-ai/deepset-cloud-sdk/issues" 44 | Source = "https://github.com/deepset-ai/deepset-cloud-sdk" 45 | 46 | 47 | [project.scripts] 48 | deepset-cloud = "deepset_cloud_sdk.cli:run_packaged" 49 | 50 | [tool.hatch.version] 
51 | path = "deepset_cloud_sdk/__about__.py" 52 | 53 | [tool.hatch.envs.default.scripts] 54 | tests-with-cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=deepset_cloud_sdk tests/unit" 55 | tests-unit = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=deepset_cloud_sdk tests/unit" 56 | tests-integration = "pytest tests/integration" 57 | 58 | [[tool.hatch.envs.all.matrix]] 59 | python = ["3.10"] 60 | 61 | [tool.hatch.envs.default] 62 | dependencies = [] 63 | 64 | [tool.hatch.envs.test.scripts] 65 | unit-with-cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=deepset_cloud_sdk tests/unit" 66 | integration = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=deepset_cloud_sdk tests/integration" 67 | 68 | [tool.hatch.envs.test] 69 | template = 'default' 70 | dependencies = [ 71 | "pytest-cov==4.0.0", 72 | "pytest==7.3.1", 73 | "pytest-asyncio==0.21.0", 74 | "haystack-ai>=2.13.2", # only for testing 75 | "respx==0.22.0", 76 | ] 77 | 78 | 79 | [tool.hatch.envs.code-quality] 80 | python = "3.10" 81 | template = 'default' 82 | detached = false 83 | # Please keep these aligned with the versions defined in .pre-commit-config.yaml 84 | dependencies = [ 85 | "pylint==2.17.4", 86 | "pydocstyle==6.3.0", 87 | "black==23.3.0", 88 | "isort==5.12.0", 89 | "mypy==1.1.1", 90 | "pre-commit==2.20.0", 91 | "types-aiofiles==23.1.0.2", 92 | "types-tabulate==0.9.0.2", 93 | "autoflake==2.1.1", 94 | ] 95 | 96 | [tool.hatch.envs.code-quality.scripts] 97 | types = "mypy deepset_cloud_sdk tests" 98 | format = "black deepset_cloud_sdk tests --check" 99 | format-fix = "black deepset_cloud_sdk tests" 100 | lint = "pylint deepset_cloud_sdk" 101 | sort = "isort --check --profile black ." 102 | sort-fix = "isort --profile black ." 
103 | hooks = "pre-commit install" 104 | docstrings = "pydocstyle deepset_cloud_sdk" 105 | flake = "autoflake --remove-all-unused-imports --remove-duplicate-keys --remove-unused-variables -v -r ./deepset_cloud_sdk" 106 | all = "hatch run types && hatch run format-fix && hatch run lint && hatch run sort && hatch run docstrings && hatch run flake" 107 | 108 | [tool.hatch.envs.tools] 109 | detached = false 110 | # Please keep these aligned with the versions defined in .pre-commit-config.yaml 111 | dependencies = ["pip-tools==6.13.0"] 112 | 113 | [tool.hatch.envs.tools.scripts] 114 | requirements = "pip-compile -o requirements.txt pyproject.toml" 115 | 116 | [tool.coverage.run] 117 | branch = true 118 | relative_files = true 119 | omit = ["deepset_cloud_sdk/__about__.py"] 120 | 121 | [tool.coverage.report] 122 | exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] 123 | 124 | [tool.black] 125 | line-length = 120 126 | 127 | [tool.mypy] 128 | python_version = "3.10" 129 | warn_return_any = true 130 | warn_unused_configs = true 131 | ignore_missing_imports = true 132 | disallow_incomplete_defs = true 133 | disallow_untyped_defs = true 134 | 135 | [tool.pylint.'MESSAGES CONTROL'] 136 | max-line-length = 150 137 | disable = [ 138 | "fixme", 139 | "c-extension-no-member", 140 | "wrong-spelling-in-comment", 141 | "wrong-spelling-in-docstring", 142 | "missing-module-docstring", 143 | ] 144 | [tool.pylint.'DESIGN'] 145 | max-args = 9 146 | 147 | [tool.pylint.'SIMILARITIES'] 148 | min-similarity-lines = 10 149 | 150 | [tool.pylint.'BASIC'] 151 | good-names = ["i", "k", "v", "_", "f1"] 152 | 153 | [tool.hatch.build.targets.sdist] 154 | exclude = ["/.github", "/tests"] 155 | 156 | [tool.hatch.build.targets.wheel] 157 | packages = ["deepset_cloud_sdk"] 158 | -------------------------------------------------------------------------------- /test-upload/example.txt: -------------------------------------------------------------------------------- 1 | this is my text 2 | -------------------------------------------------------------------------------- /test-upload/example.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value", 3 | "key2": "value2" 4 | } 5 | -------------------------------------------------------------------------------- /test-upload/example2.txt: -------------------------------------------------------------------------------- 1 | this is my text 2 2 | -------------------------------------------------------------------------------- /test-upload/example2.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value", 3 | "key2": "value2" 4 | } 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | from http import HTTPStatus 5 | from typing import Generator, List 6 | from unittest.mock import AsyncMock, Mock 7 | from uuid import uuid4 8 | 9 | import httpx 10 | import pytest 11 | import structlog 12 | from dotenv import load_dotenv 13 | 14 | # from faker import Faker 15 | 
from tenacity import retry, stop_after_delay, wait_fixed 16 | 17 | from deepset_cloud_sdk._api.config import CommonConfig 18 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 19 | from deepset_cloud_sdk._api.files import FilesAPI 20 | from deepset_cloud_sdk._api.upload_sessions import ( 21 | AWSPrefixedRequestConfig, 22 | UploadSession, 23 | UploadSessionsAPI, 24 | ) 25 | from deepset_cloud_sdk._s3.upload import S3 26 | 27 | load_dotenv() 28 | 29 | logger = structlog.get_logger(__name__) 30 | 31 | 32 | def _get_file_names(integration_config: CommonConfig, workspace_name: str) -> List[str]: 33 | list_response = httpx.get( 34 | f"{integration_config.api_url}/workspaces/{workspace_name}/files", 35 | headers={"Authorization": f"Bearer {integration_config.api_key}"}, 36 | params={"limit": 100}, 37 | ) 38 | assert list_response.status_code == HTTPStatus.OK 39 | file_names: List[str] = list_response.json()["data"] 40 | logger.info("Found files", file_names=file_names) 41 | return file_names 42 | 43 | 44 | @pytest.fixture(scope="session") 45 | def integration_config() -> CommonConfig: 46 | config = CommonConfig( 47 | api_key=os.getenv("API_KEY", ""), 48 | api_url=os.getenv("API_URL", ""), 49 | ) 50 | assert config.api_key != "", "API_KEY environment variable must be set" 51 | assert config.api_url != "", "API_URL environment variable must be set" 52 | return config 53 | 54 | 55 | @pytest.fixture(scope="session") 56 | def integration_config_safe_mode() -> CommonConfig: 57 | config = CommonConfig( 58 | api_key=os.getenv("API_KEY", ""), 59 | api_url=os.getenv("API_URL", ""), 60 | safe_mode=True, 61 | ) 62 | assert config.api_key != "", "API_KEY environment variable must be set" 63 | assert config.api_url != "", "API_URL environment variable must be set" 64 | return config 65 | 66 | 67 | @pytest.fixture 68 | def unit_config() -> CommonConfig: 69 | return CommonConfig(api_key="test_api_key", api_url="https://fake.dc.api/api/v1") 70 | 71 | 72 | @pytest.fixture 73 | def mocked_client() -> Mock: 74 | return Mock(spec=httpx.AsyncClient) 75 | 76 | 77 | @pytest.fixture 78 | def mocked_deepset_cloud_api() -> Mock: 79 | return Mock(spec=DeepsetCloudAPI) 80 | 81 | 82 | @pytest.fixture 83 | def mocked_upload_sessions_api() -> Mock: 84 | return Mock(spec=UploadSessionsAPI) 85 | 86 | 87 | @pytest.fixture 88 | def mocked_files_api() -> Mock: 89 | return Mock(spec=FilesAPI) 90 | 91 | 92 | @pytest.fixture 93 | def mocked_s3() -> Mock: 94 | # TODO: add aws client mock that sends files to aws 95 | return AsyncMock(spec=S3) 96 | 97 | 98 | @pytest.fixture 99 | def deepset_cloud_api(unit_config: CommonConfig, mocked_client: Mock) -> DeepsetCloudAPI: 100 | return DeepsetCloudAPI(config=unit_config, client=mocked_client) 101 | 102 | 103 | @pytest.fixture 104 | def upload_session_response() -> UploadSession: 105 | return UploadSession( 106 | session_id=uuid4(), 107 | documentation_url="Documentation URL", 108 | expires_at=datetime.datetime.now(), 109 | aws_prefixed_request_config=AWSPrefixedRequestConfig(url="uploadURL", fields={"key": "value"}), 110 | ) 111 | 112 | 113 | @retry( 114 | stop=stop_after_delay(120), 115 | wait=wait_fixed(1), 116 | reraise=True, 117 | ) 118 | def _wait_for_file_to_be_available( 119 | integration_config: CommonConfig, workspace_name: str, expected_file_count: int = 15 120 | ) -> None: 121 | assert len(_get_file_names(integration_config, workspace_name)) >= expected_file_count 122 | 123 | 124 | @pytest.fixture(scope="session") 125 | def workspace_name(integration_config: 
CommonConfig) -> Generator[str, None, None]: 126 | """Create a workspace for the tests and delete it afterwards.""" 127 | workspace_name = f"sdktest_{uuid4()}" 128 | 129 | logger.info("Creating workspace", workspace_name=workspace_name) 130 | 131 | # try creating workspace 132 | response = httpx.post( 133 | f"{integration_config.api_url}/workspaces", 134 | json={"name": workspace_name}, 135 | headers={"Authorization": f"Bearer {integration_config.api_key}"}, 136 | ) 137 | assert response.status_code in (HTTPStatus.CREATED, HTTPStatus.CONFLICT) 138 | 139 | try: 140 | if len(_get_file_names(integration_config=integration_config, workspace_name=workspace_name)) == 0: 141 | for i in range(15): 142 | response = httpx.post( 143 | f"{integration_config.api_url}/workspaces/{workspace_name}/files", 144 | data={"text": "This is text"}, 145 | files={ 146 | "meta": (None, json.dumps({"find": "me"}).encode("utf-8")), 147 | }, 148 | params={"file_name": f"example{i}.txt"}, 149 | headers={"Authorization": f"Bearer {integration_config.api_key}"}, 150 | ) 151 | assert response.status_code == HTTPStatus.CREATED 152 | 153 | _wait_for_file_to_be_available(integration_config, workspace_name, expected_file_count=15) 154 | 155 | yield workspace_name 156 | 157 | finally: 158 | response = httpx.delete( 159 | f"{integration_config.api_url}/workspaces/{workspace_name}", 160 | headers={"Authorization": f"Bearer {integration_config.api_key}"}, 161 | ) 162 | 163 | assert response.status_code in (HTTPStatus.OK, HTTPStatus.NO_CONTENT) 164 | -------------------------------------------------------------------------------- /tests/data/.fake-env: -------------------------------------------------------------------------------- 1 | API_KEY="fake-api-key" 2 | -------------------------------------------------------------------------------- /tests/data/direct_upload/example.txt: -------------------------------------------------------------------------------- 1 | asdf 2 | -------------------------------------------------------------------------------- /tests/data/direct_upload/example.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value" 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/example.txt: -------------------------------------------------------------------------------- 1 | This is text -------------------------------------------------------------------------------- /tests/data/upload_folder/example.csv: -------------------------------------------------------------------------------- 1 | example 1,example 2 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.csv.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value" 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.docx: -------------------------------------------------------------------------------- 1 | This is text 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.html: -------------------------------------------------------------------------------- 1 |

example 1

2 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder/example.jpg -------------------------------------------------------------------------------- /tests/data/upload_folder/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "example": "This is an example of a JSON object." 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.md: -------------------------------------------------------------------------------- 1 | # Example 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder/example.pdf -------------------------------------------------------------------------------- /tests/data/upload_folder/example.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder/example.pptx -------------------------------------------------------------------------------- /tests/data/upload_folder/example.txt: -------------------------------------------------------------------------------- 1 | This is text 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": "value" 3 | } 4 | -------------------------------------------------------------------------------- /tests/data/upload_folder/example.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder/example.xlsx -------------------------------------------------------------------------------- /tests/data/upload_folder/example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example 1 4 | 5 | -------------------------------------------------------------------------------- /tests/data/upload_folder_nested/example.txt: -------------------------------------------------------------------------------- 1 | This is text 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder_nested/meta/example.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"a": "b"} 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder_nested/nested_folder/second.txt: -------------------------------------------------------------------------------- 1 | This is text 2 2 | -------------------------------------------------------------------------------- /tests/data/upload_folder_with_duplicates/file1.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder_with_duplicates/file1.txt -------------------------------------------------------------------------------- /tests/data/upload_folder_with_duplicates/file2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder_with_duplicates/file2.txt -------------------------------------------------------------------------------- /tests/data/upload_folder_with_duplicates/old_files/file1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder_with_duplicates/old_files/file1.txt -------------------------------------------------------------------------------- /tests/data/upload_folder_with_duplicates/old_files/file2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/data/upload_folder_with_duplicates/old_files/file2.txt -------------------------------------------------------------------------------- /tests/integration/api/test_integration_files.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import pytest 4 | import tenacity 5 | 6 | from deepset_cloud_sdk._api.config import CommonConfig 7 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 8 | from deepset_cloud_sdk._api.files import FilesAPI 9 | 10 | 11 | @pytest.mark.asyncio 12 | class TestListFiles: 13 | async def test_list_paginated( 14 | self, 15 | integration_config: CommonConfig, 16 | workspace_name: str, 17 | ) -> None: 18 | async with DeepsetCloudAPI.factory(integration_config) as deepset_cloud_api: 19 | files_api = FilesAPI(deepset_cloud_api) 20 | 21 | # We need to retry fetching this, because the file itself is available 22 | # immediately, but the search index might not be updated yet. 23 | # We are searching by context here which is otherwise not available. 
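# tenacity.Retrying yields attempt contexts: any exception raised inside the
# `with attempt:` block below triggers another try after a fixed 0.5 s wait,
# until the 300-second stop condition is reached; with reraise=True the last
# error is then re-raised so the test fails with the original assertion message.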
24 | for attempt in tenacity.Retrying( 25 | stop=tenacity.stop_after_delay(300), 26 | wait=tenacity.wait_fixed(wait=timedelta(seconds=0.5)), 27 | reraise=True, 28 | ): 29 | with attempt: 30 | result = await files_api.list_paginated( 31 | workspace_name=workspace_name, 32 | limit=10, 33 | name="example0.txt", 34 | odata_filter="find eq 'me'", 35 | ) 36 | assert result.total == 1 37 | assert result.has_more is False 38 | assert len(result.data) == 1 39 | found_file = result.data[0] 40 | assert found_file.name == "example0.txt" 41 | assert found_file.size > 0 42 | assert found_file.meta == {"find": "me"} 43 | -------------------------------------------------------------------------------- /tests/integration/api/test_integration_upload_sessions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from deepset_cloud_sdk._api.config import CommonConfig 4 | from deepset_cloud_sdk._api.deepset_cloud_api import DeepsetCloudAPI 5 | from deepset_cloud_sdk._api.upload_sessions import ( 6 | UploadSession, 7 | UploadSessionDetailList, 8 | UploadSessionIngestionStatus, 9 | UploadSessionsAPI, 10 | ) 11 | 12 | 13 | @pytest.mark.asyncio 14 | @pytest.mark.parametrize("integration_config", ["integration_config", "integration_config_safe_mode"], indirect=True) 15 | class TestCreateUploadSessions: 16 | async def test_create_and_close_upload_session(self, integration_config: CommonConfig, workspace_name: str) -> None: 17 | async with DeepsetCloudAPI.factory(integration_config) as deepset_cloud_api: 18 | upload_session_client = UploadSessionsAPI(deepset_cloud_api) 19 | 20 | result: UploadSession = await upload_session_client.create(workspace_name=workspace_name) 21 | assert result.session_id is not None 22 | assert result.documentation_url is not None 23 | assert result.expires_at is not None 24 | 25 | assert "-user-files-upload.s3.amazonaws.com/" in result.aws_prefixed_request_config.url 26 | 27 | assert result.aws_prefixed_request_config.fields["key"] is not None 28 | 29 | await upload_session_client.close(workspace_name=workspace_name, session_id=result.session_id) 30 | 31 | session_status = await upload_session_client.status( 32 | workspace_name=workspace_name, session_id=result.session_id 33 | ) 34 | assert session_status.session_id is not None 35 | assert session_status.documentation_url is not None 36 | assert session_status.expires_at is not None 37 | assert session_status.ingestion_status == UploadSessionIngestionStatus(failed_files=0, finished_files=0) 38 | 39 | async def test_list_upload_session(self, integration_config: CommonConfig, workspace_name: str) -> None: 40 | async with DeepsetCloudAPI.factory(integration_config) as deepset_cloud_api: 41 | upload_session_client = UploadSessionsAPI(deepset_cloud_api) 42 | 43 | await upload_session_client.create(workspace_name=workspace_name) 44 | 45 | result: UploadSessionDetailList = await upload_session_client.list( 46 | workspace_name=workspace_name, limit=1, page_number=1 47 | ) 48 | 49 | assert result.total > 0 50 | assert result.data is not None 51 | assert len(result.data) == 1 52 | -------------------------------------------------------------------------------- /tests/integration/workflows/test_integration_pipeline_client.py: -------------------------------------------------------------------------------- 1 | """Integration tests for importing Haystack pipelines into deepset AI Platform.""" 2 | import json 3 | 4 | import pytest 5 | import respx 6 | from haystack import AsyncPipeline, Pipeline 7 
| from haystack.components.builders.answer_builder import AnswerBuilder 8 | from haystack.components.builders.prompt_builder import PromptBuilder 9 | from haystack.components.converters.txt import TextFileToDocument 10 | from haystack.components.embedders.sentence_transformers_document_embedder import ( 11 | SentenceTransformersDocumentEmbedder, 12 | ) 13 | from haystack.components.generators.openai import OpenAIGenerator 14 | from haystack.components.routers.file_type_router import FileTypeRouter 15 | from haystack.utils import Secret 16 | from httpx import Response 17 | 18 | from deepset_cloud_sdk.workflows.pipeline_client import PipelineClient 19 | from deepset_cloud_sdk.workflows.pipeline_client.models import ( 20 | IndexConfig, 21 | IndexInputs, 22 | PipelineConfig, 23 | PipelineInputs, 24 | PipelineOutputs, 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize("pipeline_class", [Pipeline, AsyncPipeline]) 29 | class TestImportIndexIntoDeepset: 30 | @pytest.fixture 31 | def sample_index(self, pipeline_class: Pipeline | AsyncPipeline) -> Pipeline: 32 | """Create a simple index for testing.""" 33 | file_type_router = FileTypeRouter(mime_types=["text/plain"]) 34 | text_converter = TextFileToDocument(encoding="utf-8") 35 | document_embedder = SentenceTransformersDocumentEmbedder(normalize_embeddings=True, model="intfloat/e5-base-v2") 36 | 37 | # Create and configure pipeline 38 | index = pipeline_class() 39 | 40 | # Add components 41 | index.add_component("file_type_router", file_type_router) 42 | index.add_component("text_converter", text_converter) 43 | index.add_component("document_embedder", document_embedder) 44 | 45 | # Connect components 46 | index.connect("file_type_router.text/plain", "text_converter.sources") 47 | index.connect("text_converter.documents", "document_embedder.documents") 48 | 49 | return index 50 | 51 | @pytest.mark.integration 52 | @respx.mock 53 | def test_import_index_into_deepset(self, sample_index: Pipeline) -> None: 54 | """Test synchronously importing an index into deepset.""" 55 | route = respx.post("https://test-api-url.com/workspaces/test-workspace/indexes").mock( 56 | return_value=Response(status_code=201, json={"id": "test-index-id"}) 57 | ) 58 | 59 | # Initialize client with explicit configuration 60 | client = PipelineClient( 61 | api_key="test-api-key", api_url="https://test-api-url.com", workspace_name="test-workspace" 62 | ) 63 | 64 | index_config = IndexConfig( 65 | name="test-index", 66 | inputs=IndexInputs( 67 | files=["file_type_router.sources"], 68 | ), 69 | ) 70 | 71 | client.import_into_deepset(sample_index, index_config) 72 | 73 | assert route.called 74 | request = route.calls.last.request 75 | assert request.headers["Authorization"] == "Bearer test-api-key" 76 | 77 | request_body = json.loads(request.content) 78 | assert request_body["name"] == "test-index" 79 | assert request_body["config_yaml"].startswith("components:\n document_embedder:\n") 80 | 81 | @pytest.mark.asyncio 82 | @pytest.mark.integration 83 | @respx.mock 84 | async def test_import_index_into_deepset_async(self, sample_index: Pipeline) -> None: 85 | """Test asynchronously importing an index into deepset.""" 86 | route = respx.post("https://test-api-url.com/workspaces/test-workspace/indexes").mock( 87 | return_value=Response(status_code=201, json={"id": "test-index-id"}) 88 | ) 89 | 90 | # Initialize client with explicit configuration 91 | client = PipelineClient( 92 | api_key="test-api-key", api_url="https://test-api-url.com", workspace_name="test-workspace" 93 | ) 94 | 95 | 
index_config = IndexConfig( 96 | name="test-index-async", 97 | inputs=IndexInputs( 98 | files=["file_type_router.sources"], 99 | ), 100 | ) 101 | 102 | await client.import_into_deepset_async(sample_index, index_config) 103 | 104 | assert route.called 105 | request = route.calls.last.request 106 | assert request.headers["Authorization"] == "Bearer test-api-key" 107 | 108 | request_body = json.loads(request.content) 109 | assert request_body["name"] == "test-index-async" 110 | assert request_body["config_yaml"].startswith("components:\n document_embedder:\n") 111 | 112 | 113 | @pytest.mark.parametrize("pipeline_class", [Pipeline, AsyncPipeline]) 114 | class TestImportPipelineIntoDeepset: 115 | @pytest.fixture 116 | def sample_pipeline(self, pipeline_class: Pipeline | AsyncPipeline, monkeypatch: pytest.MonkeyPatch) -> Pipeline: 117 | """Create a sample pipeline for testing.""" 118 | monkeypatch.setenv("OPENAI_API_KEY", "test-openai-api-key") 119 | 120 | # Initialize components 121 | prompt_builder = PromptBuilder( 122 | template="""You are a technical expert. 123 | You summary should be no longer than five sentences. 124 | Passage: {{ question }} 125 | Your summary: """, 126 | required_variables=["*"], 127 | ) 128 | 129 | llm = OpenAIGenerator(api_key=Secret.from_env_var("OPENAI_API_KEY", strict=False), model="gpt-4") 130 | 131 | answer_builder = AnswerBuilder() 132 | 133 | # Create and configure pipeline 134 | pipeline = pipeline_class() 135 | 136 | # Add components 137 | pipeline.add_component("prompt_builder", prompt_builder) 138 | pipeline.add_component("llm", llm) 139 | pipeline.add_component("answer_builder", answer_builder) 140 | 141 | # Connect components 142 | pipeline.connect("prompt_builder.prompt", "llm.prompt") 143 | pipeline.connect("llm.replies", "answer_builder.replies") 144 | 145 | return pipeline 146 | 147 | @pytest.mark.integration 148 | @respx.mock 149 | def test_import_pipeline_into_deepset(self, sample_pipeline: Pipeline) -> None: 150 | """Test synchronously importing a pipeline into deepset AI Platform.""" 151 | route = respx.post("https://test-api-url.com/workspaces/test-workspace/pipelines").mock( 152 | return_value=Response(status_code=201, json={"id": "test-pipeline-id"}) 153 | ) 154 | 155 | client = PipelineClient( 156 | api_key="test-api-key", api_url="https://test-api-url.com", workspace_name="test-workspace" 157 | ) 158 | 159 | pipeline_config = PipelineConfig( 160 | name="test-pipeline", 161 | inputs=PipelineInputs(query=["prompt_builder.prompt", "answer_builder.query"]), 162 | outputs=PipelineOutputs(answers="answer_builder.answers"), 163 | ) 164 | client.import_into_deepset(sample_pipeline, pipeline_config) 165 | 166 | assert route.called 167 | request = route.calls.last.request 168 | assert request.headers["Authorization"] == "Bearer test-api-key" 169 | 170 | request_body = json.loads(request.content) 171 | assert request_body["name"] == "test-pipeline" 172 | assert request_body["query_yaml"].startswith("components:\n answer_builder:\n init_parameters:\n") 173 | 174 | @pytest.mark.asyncio 175 | @pytest.mark.integration 176 | @respx.mock 177 | async def test_import_pipeline_into_deepset_async(self, sample_pipeline: Pipeline) -> None: 178 | """Test asynchronously importing a pipeline into deepset.""" 179 | route = respx.post("https://test-api-url.com/workspaces/test-workspace/pipelines").mock( 180 | return_value=Response(status_code=200, json={"name": "test-pipeline-id"}) 181 | ) 182 | 183 | client = PipelineClient( 184 | api_key="test-api-key", 
api_url="https://test-api-url.com", workspace_name="test-workspace" 185 | ) 186 | 187 | pipeline_config = PipelineConfig( 188 | name="test-pipeline", 189 | inputs=PipelineInputs(query=["prompt_builder.prompt", "answer_builder.query"]), 190 | outputs=PipelineOutputs(answers="answer_builder.answers"), 191 | ) 192 | await client.import_into_deepset_async(sample_pipeline, pipeline_config) 193 | 194 | assert route.called 195 | request = route.calls.last.request 196 | assert request.headers["Authorization"] == "Bearer test-api-key" 197 | 198 | request_body = json.loads(request.content) 199 | assert request_body["name"] == "test-pipeline" 200 | assert request_body["query_yaml"].startswith("components:\n answer_builder:\n init_parameters:\n") 201 | -------------------------------------------------------------------------------- /tests/test_data/basic.txt: -------------------------------------------------------------------------------- 1 | this is a file 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/103275.txt: -------------------------------------------------------------------------------- 1 | For example the median expected hourly pay for a typical Physician - Pediatric Neonatology in the United States is $116 an hour, so 50% of the people who perform the job of Physician - Pediatric Neonatology in the United States are expected to make less than $116. Source: HR Reported data as of January 02, 2018 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/103275.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "103275", "source": "msmarco", "meta_1": "category_4", "meta_2": 1, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/103291.txt: -------------------------------------------------------------------------------- 1 | Origin of the name Cassandra: Derived from the Greek Kassandra, the mythological daughter of Priam and Hecuba who had the power of prophesy. Var: Casaundra, Kasandra, Kassandra, Kasaundra. Short: Cass, Kass, Sandra, Saundra.Pet: Cassi, Cassie, Cassy, Kassi, Kassie, Sandi, Sandie, Sandy.From A World of Baby Names by Teresa Norman.erived from the Greek Kassandra, the mythological daughter of Priam and Hecuba who had the power of prophesy. Var: Casaundra, Kasandra, Kassandra, Kasaundra. Short: Cass, Kass, Sandra, Saundra. Pet: Cassi, Cassie, Cassy, Kassi, Kassie, Sandi, Sandie, Sandy. From A World of Baby Names by Teresa Norman. 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/103291.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "103291", "source": "msmarco", "meta_1": "category_0", "meta_2": 1, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/110580.txt: -------------------------------------------------------------------------------- 1 | Toobtaintext#messagedetails#theAccount#Holder#must#completethis#consent#form,#signit#andhaveit#notarizedbefore. returning#it#to#Sprint.!We!can!provide!text!message!details!(dates,!times!and!phone!numbers),!butnot$the$content!of!your! 
2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/110580.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "110580", "source": "msmarco", "meta_1": "category_0", "meta_2": 0, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/117256.txt: -------------------------------------------------------------------------------- 1 | GBC International Bank is an FDIC insured institution located in Los Angeles, CA. It was founded in 1976 and has approximately $0.49 billion in assets. Customers can open an account at one of its 9 branches. 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/117256.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "117256", "source": "msmarco", "meta_1": "category_3", "meta_2": 1, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/16675.txt: -------------------------------------------------------------------------------- 1 | Well, i usually measure 20 inches above floor height. That is a standard i use, and for switches i measure 52 inches. for all the houses and buildings i have installed i a…pplied this. Standard heights on receptacles is 12 inches (300 mm) to center and on switches 48 inches (1200 mm) to center. 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/16675.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "16675", "source": "msmarco", "meta_1": "category_2", "meta_2": 3, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/22297.txt: -------------------------------------------------------------------------------- 1 | Wonderful Tonight is written by Eric Clapton. It was included on Clapton's 1977 album Slowhand and released as a single the following year. In 1988, Clapton appeared in the Nelson Mandela 70th Birthday Tribute concert as a guest guitarist for Dire Straits. The group became his backing musicians for a surprise performance of Wonderful Tonight during their set. 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/22297.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "22297", "source": "msmarco", "meta_1": "category_1", "meta_2": 2, "has_devset_query": true} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/35887.txt: -------------------------------------------------------------------------------- 1 | The Flu Is Contagious Most healthy adults may be able to infect other people beginning 1 day before symptoms develop and up to 5 to 7 days after becoming sick. Children may pass the virus for longer than 7 days. 
2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/35887.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "35887", "source": "msmarco", "meta_1": "category_4", "meta_2": 0, "has_devset_query": true} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/61768.txt: -------------------------------------------------------------------------------- 1 | Definition of saturated - holding as much water or moisture as can be absorbed; thoroughly soaked, (of an organic molecule) containing the greatest possible Definition of saturated - holding as much water or moisture as can be absorbed; thoroughly soaked, (of an organic molecule) containing the greatest possible dictionary thesaurus 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/61768.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "61768", "source": "msmarco", "meta_1": "category_2", "meta_2": 4, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/79388.txt: -------------------------------------------------------------------------------- 1 | We welcome all our Booking.com guests to the Aruba Marriott Resort & Stellaris Casino. Come experience our first class service paired with our stellar resort on Palm Beach in Aruba. 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/79388.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "79388", "source": "msmarco", "meta_1": "category_0", "meta_2": 0, "has_devset_query": false} 2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/87243.txt: -------------------------------------------------------------------------------- 1 | Dear SS: According to Foodsafety.gov, uncooked poultry is safe in the refrigerator for 1-2 days, and safe in the freezer for 9 months (for pieces) and up to 12 months for whole chickens or turkey. Cooked poultry is safe refrigerated for 3-4 days. 
2 | -------------------------------------------------------------------------------- /tests/test_data/msmarco.10/87243.txt.meta.json: -------------------------------------------------------------------------------- 1 | {"pid": "87243", "source": "msmarco", "meta_1": "category_1", "meta_2": 0, "has_devset_query": true} 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file00.txt: -------------------------------------------------------------------------------- 1 | Some text as a Textfile of file file00.txt 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file00.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "file_name_duplicate_check": "file00.txt", 3 | "source": "multiple file types" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file01.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Example 1 4 | 5 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file01.xml.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file01", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file02.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/test_data/multiple_file_types/file02.pptx -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file02.pptx.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file02", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file03.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/test_data/multiple_file_types/file03.xlsx -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file03.xlsx.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file03", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file04.json: -------------------------------------------------------------------------------- 1 | { 2 | "example": "This is an example of a JSON object." 
3 | } 4 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file04.json.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file04", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file05.docx: -------------------------------------------------------------------------------- 1 | This is text 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file05.docx.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file05", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file06.md: -------------------------------------------------------------------------------- 1 | # Example 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file06.md.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file06", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file07.csv: -------------------------------------------------------------------------------- 1 | example 1,example 2 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file07.csv.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file07", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/deepset-cloud-sdk/3dca35534085225f60b2b18d50b8310c1f006099/tests/test_data/multiple_file_types/file08.pdf -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file08.pdf.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file08", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file09.html: -------------------------------------------------------------------------------- 1 |
example 1
2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types/file09.html.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "pid": "file09", 3 | "source": "multiple file types", 4 | "meta_1": "category_3", 5 | "meta_2": 1, 6 | "has_devset_query": false 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types_caps/File00.txt: -------------------------------------------------------------------------------- 1 | Some text as a Textfile of file File00.txt with capital letters and some small letters. 2 | -------------------------------------------------------------------------------- /tests/test_data/multiple_file_types_caps/File00.txt.meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "file_name_duplicate_check": "File00.txt", 3 | "source": "multiple file types" 4 | } 5 | -------------------------------------------------------------------------------- /tests/unit/utils/test_datetime_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import pytest 4 | 5 | from deepset_cloud_sdk._utils.datetime import from_isoformat 6 | 7 | 8 | class TestFromIsoformat: 9 | @pytest.mark.parametrize( 10 | "input", 11 | [ 12 | "2024-02-03T08:10:10.335884Z", 13 | "2024-02-03T08:10:10.335884+00:00", 14 | ], 15 | ) 16 | def test_fromisoformat(self, input: str) -> None: 17 | assert from_isoformat(input) == datetime(2024, 2, 3, 8, 10, 10, 335884).replace(tzinfo=timezone.utc) 18 | -------------------------------------------------------------------------------- /tests/unit/utils/test_load_configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Generator 4 | from unittest.mock import Mock 5 | 6 | import pytest 7 | 8 | from deepset_cloud_sdk._api.config import load_environment 9 | 10 | 11 | class TestLoadEnvironment: 12 | """Test the environment loading functionality.""" 13 | 14 | @pytest.fixture(autouse=True) 15 | def clean_env(self) -> Generator[None, None, None]: 16 | """Fixture to provide a clean environment for tests.""" 17 | original_environ = os.environ.copy() 18 | os.environ.clear() 19 | yield 20 | os.environ.clear() 21 | os.environ.update(original_environ) 22 | 23 | def test_load_local_env_only(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 24 | """Test loading only local .env file.""" 25 | # Create a temporary local .env file 26 | local_env = tmp_path / ".env" 27 | local_env.write_text("API_KEY=local_key\nAPI_URL=local_url\nDEFAULT_WORKSPACE_NAME=local_workspace") 28 | 29 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 30 | # Mock Path.is_file to return True for local .env and False for global 31 | monkeypatch.setattr(Path, "is_file", lambda self: self == local_env) 32 | 33 | # Mock load_dotenv to actually load the variables into the environment 34 | def mock_load_dotenv(path: Path, override: bool = True) -> bool: 35 | os.environ["API_KEY"] = "local_key" 36 | os.environ["API_URL"] = "local_url" 37 | os.environ["DEFAULT_WORKSPACE_NAME"] = "local_workspace" 38 | return True 39 | 40 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mock_load_dotenv) 41 | 42 | assert load_environment() 43 | 44 | def 
test_load_global_env_only(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 45 | """Test loading only global .env file.""" 46 | # Create a temporary global .env file 47 | global_env_dir = tmp_path / "global_config" 48 | global_env_dir.mkdir() 49 | global_env = global_env_dir / ".env" 50 | global_env.write_text("API_KEY=global_key\nAPI_URL=global_url\nDEFAULT_WORKSPACE_NAME=global_workspace") 51 | 52 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 53 | # point mocked global path to global ENV_FILE_PATH definition 54 | monkeypatch.setattr("deepset_cloud_sdk._api.config.ENV_FILE_PATH", global_env) 55 | 56 | # Mock load_dotenv to actually load the variables into the environment 57 | def mock_load_dotenv(path: Path, override: bool = True) -> bool: 58 | os.environ["API_KEY"] = "global_key" 59 | os.environ["API_URL"] = "global_url" 60 | os.environ["DEFAULT_WORKSPACE_NAME"] = "global_workspace" 61 | return True 62 | 63 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mock_load_dotenv) 64 | 65 | assert load_environment() 66 | 67 | def test_load_both_env_files(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 68 | """Test loading both local and global .env files.""" 69 | # Create temporary local and global .env files 70 | local_env = tmp_path / ".env" 71 | local_env.write_text("API_KEY=local_key\nAPI_URL=local_url\nDEFAULT_WORKSPACE_NAME=local_workspace") 72 | global_env = tmp_path / "global.env" 73 | global_env.write_text("API_KEY=global_key\nAPI_URL=global_url\nDEFAULT_WORKSPACE_NAME=global_workspace") 74 | 75 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 76 | monkeypatch.setattr(Path, "is_file", Mock(return_value=True)) 77 | monkeypatch.setattr("deepset_cloud_sdk._api.config.ENV_FILE_PATH", global_env) 78 | 79 | assert load_environment() 80 | assert os.environ["API_KEY"] == "local_key" 81 | assert os.environ["API_URL"] == "local_url" 82 | assert os.environ["DEFAULT_WORKSPACE_NAME"] == "local_workspace" 83 | 84 | def test_global_env_fills_missing_variables(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 85 | """Test that global .env variables are available when not defined in local .env.""" 86 | # Create local .env with only API_KEY 87 | local_env = tmp_path / ".env" 88 | local_env.write_text("API_KEY=local_key") 89 | 90 | # Create global .env with both API_KEY and API_URL 91 | global_env_dir = tmp_path / "global_config" 92 | global_env_dir.mkdir() 93 | global_env = global_env_dir / ".env" 94 | global_env.write_text("API_KEY=global_key\nAPI_URL=global_url\nDEFAULT_WORKSPACE_NAME=global_workspace") 95 | 96 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 97 | monkeypatch.setattr("deepset_cloud_sdk._api.config.ENV_FILE_PATH", global_env) 98 | 99 | # Mock is_file to return True for both files 100 | monkeypatch.setattr(Path, "is_file", lambda self: self in [local_env, global_env]) 101 | 102 | assert load_environment() 103 | # Local API_KEY should take precedence 104 | assert os.environ["API_KEY"] == "local_key" 105 | # Global API_URL should be available 106 | assert os.environ["API_URL"] == "global_url" 107 | # Global DEFAULT_WORKSPACE_NAME should be available 108 | assert os.environ["DEFAULT_WORKSPACE_NAME"] == "global_workspace" 109 | 110 | def test_pre_existing_env_vars_take_precedence(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 111 | """Test that pre-existing environment variables 
take precedence over .env files.""" 112 | # Create local .env with API_KEY and API_URL 113 | local_env = tmp_path / ".env" 114 | local_env.write_text("API_KEY=local_key\nAPI_URL=local_url\nDEFAULT_WORKSPACE_NAME=local_workspace") 115 | 116 | # Create global .env with different values 117 | global_env_dir = tmp_path / "global_config" 118 | global_env_dir.mkdir() 119 | global_env = global_env_dir / ".env" 120 | global_env.write_text("API_KEY=global_key\nAPI_URL=global_url\nDEFAULT_WORKSPACE_NAME=global_workspace") 121 | 122 | # Set pre-existing environment variables 123 | os.environ["API_KEY"] = "pre_existing_key" 124 | os.environ["API_URL"] = "pre_existing_url" 125 | os.environ["DEFAULT_WORKSPACE_NAME"] = "pre_existing_workspace" 126 | 127 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 128 | monkeypatch.setattr("deepset_cloud_sdk._api.config.ENV_FILE_PATH", global_env) 129 | 130 | # Mock is_file to return True for both files 131 | monkeypatch.setattr(Path, "is_file", Mock(return_value=True)) 132 | 133 | assert load_environment() 134 | # Pre-existing values should take precedence 135 | assert os.environ["API_KEY"] == "pre_existing_key" 136 | assert os.environ["API_URL"] == "pre_existing_url" 137 | assert os.environ["DEFAULT_WORKSPACE_NAME"] == "pre_existing_workspace" 138 | 139 | def test_no_env_files_with_warnings(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 140 | """Test when no .env files exist and show_warnings=True.""" 141 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 142 | monkeypatch.setattr(Path, "is_file", Mock(return_value=False)) 143 | mocked_load_dotenv = Mock() 144 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mocked_load_dotenv) 145 | 146 | assert not load_environment() 147 | 148 | # Mock the logger to verify it's called 149 | mock_logger = Mock() 150 | monkeypatch.setattr("deepset_cloud_sdk._api.config.logger", mock_logger) 151 | 152 | result = load_environment(show_warnings=True) 153 | 154 | assert result is False 155 | mock_logger.warning.assert_called_once() 156 | warning_call = mock_logger.warning.call_args[0][0] 157 | assert "No .env files found" in warning_call 158 | 159 | def test_no_env_files_in_silent_mode(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 160 | """Test that no warnings are logged when show_warnings=False.""" 161 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 162 | monkeypatch.setattr(Path, "is_file", Mock(return_value=False)) 163 | mocked_load_dotenv = Mock() 164 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mocked_load_dotenv) 165 | 166 | # Mock the logger to verify it's not called 167 | mock_logger = Mock() 168 | monkeypatch.setattr("deepset_cloud_sdk._api.config.logger", mock_logger) 169 | 170 | result = load_environment(show_warnings=False) 171 | 172 | assert result is True 173 | mock_logger.warning.assert_not_called() 174 | assert mocked_load_dotenv.call_count == 0 175 | 176 | def test_missing_vars_no_warning_in_silent_mode(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: 177 | """Test that missing variables warning is NOT logged when show_warnings=False.""" 178 | # Create a temporary local .env file with only API_KEY 179 | local_env = tmp_path / ".env" 180 | local_env.write_text("API_KEY=test_key") 181 | 182 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 183 | 
monkeypatch.setattr(Path, "is_file", lambda self: self == local_env) 184 | 185 | # Mock load_dotenv to actually load the variables into the environment 186 | def mock_load_dotenv(path: Path, override: bool = True) -> bool: 187 | os.environ["API_KEY"] = "test_key" 188 | return True 189 | 190 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mock_load_dotenv) 191 | mock_logger = Mock() 192 | monkeypatch.setattr("deepset_cloud_sdk._api.config.logger", mock_logger) 193 | 194 | result = load_environment(show_warnings=False) 195 | 196 | assert result is True 197 | mock_logger.warning.assert_not_called() 198 | 199 | @pytest.mark.parametrize( 200 | "missing_var", 201 | [ 202 | "API_KEY=global_key\nAPI_URL=global_url", 203 | "API_KEY=global_key\nDEFAULT_WORKSPACE_NAME=global_workspace", 204 | "API_URL=global_url\nDEFAULT_WORKSPACE_NAME=global_workspace", 205 | ], 206 | ) 207 | def test_missing_required_variables_with_warnings( 208 | self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, missing_var: str 209 | ) -> None: 210 | """Test when required environment variables are missing and show_warnings=True.""" 211 | local_env = tmp_path / ".env" 212 | local_env.write_text(missing_var) 213 | 214 | monkeypatch.setattr("deepset_cloud_sdk._api.config.Path.cwd", Mock(return_value=tmp_path)) 215 | monkeypatch.setattr(Path, "is_file", lambda self: self == local_env) 216 | 217 | # Mock load_dotenv to actually load the variables into the environment 218 | def mock_load_dotenv(path: Path, override: bool = True) -> bool: 219 | for line in missing_var.split("\n"): 220 | key, value = line.split("=") 221 | os.environ[key] = value 222 | return True 223 | 224 | monkeypatch.setattr("deepset_cloud_sdk._api.config.load_dotenv", mock_load_dotenv) 225 | 226 | mock_logger = Mock() 227 | monkeypatch.setattr("deepset_cloud_sdk._api.config.logger", mock_logger) 228 | 229 | result = load_environment(show_warnings=True) 230 | 231 | assert result is False 232 | assert mock_logger.warning.call_count == 1 233 | warning_call = mock_logger.warning.call_args[0][0] 234 | assert "Missing required environment variables" in warning_call 235 | -------------------------------------------------------------------------------- /tests/unit/workflows/async_client/test_async_workflow_files.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | from typing import Any, AsyncGenerator, List 4 | from unittest.mock import AsyncMock 5 | from uuid import UUID 6 | 7 | import pytest 8 | from _pytest.monkeypatch import MonkeyPatch 9 | from sniffio import AsyncLibraryNotFoundError 10 | 11 | from deepset_cloud_sdk._api.config import DEFAULT_WORKSPACE_NAME 12 | from deepset_cloud_sdk._api.files import File 13 | from deepset_cloud_sdk._api.upload_sessions import ( 14 | UploadSessionDetail, 15 | UploadSessionIngestionStatus, 16 | UploadSessionStatus, 17 | UploadSessionStatusEnum, 18 | UploadSessionWriteModeEnum, 19 | WriteMode, 20 | ) 21 | from deepset_cloud_sdk._service.files_service import FilesService 22 | from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo 23 | from deepset_cloud_sdk.workflows.async_client.files import ( 24 | download, 25 | get_upload_session, 26 | list_files, 27 | list_upload_sessions, 28 | upload, 29 | upload_texts, 30 | ) 31 | 32 | 33 | @pytest.mark.asyncio 34 | class TestUploadFiles: 35 | async def test_upload_show_progress(self, monkeypatch: MonkeyPatch) -> None: 36 | paths = [Path("./tests/data/example.txt")] 37 | 
mocked_preprocess = AsyncMock(return_value=paths) 38 | mocked_upload_file_paths = AsyncMock(return_value=None) 39 | monkeypatch.setattr(FilesService, "_preprocess_paths", mocked_preprocess) 40 | monkeypatch.setattr(FilesService, "upload_file_paths", mocked_upload_file_paths) 41 | 42 | await upload(paths=paths, show_progress=True) 43 | 44 | assert mocked_preprocess.call_args.kwargs.get("spinner") is not None 45 | 46 | async def test_upload_dont_show_progress(self, monkeypatch: MonkeyPatch) -> None: 47 | paths = [Path("./tests/data/example.txt")] 48 | mocked_preprocess = AsyncMock(return_value=paths) 49 | mocked_upload_file_paths = AsyncMock(return_value=None) 50 | monkeypatch.setattr(FilesService, "_preprocess_paths", mocked_preprocess) 51 | monkeypatch.setattr(FilesService, "upload_file_paths", mocked_upload_file_paths) 52 | 53 | await upload(paths=paths, show_progress=False) 54 | 55 | assert mocked_preprocess.call_args.kwargs.get("spinner") is None 56 | 57 | async def test_upload(self, monkeypatch: MonkeyPatch) -> None: 58 | mocked_upload = AsyncMock(return_value=None) 59 | 60 | monkeypatch.setattr(FilesService, "upload", mocked_upload) 61 | await upload(paths=[Path("./tests/data/upload_folder")]) 62 | 63 | mocked_upload.assert_called_once_with( 64 | workspace_name=DEFAULT_WORKSPACE_NAME, 65 | paths=[Path("./tests/data/upload_folder")], 66 | write_mode=WriteMode.KEEP, 67 | blocking=True, 68 | timeout_s=None, 69 | show_progress=True, 70 | recursive=False, 71 | desired_file_types=None, 72 | enable_parallel_processing=False, 73 | ) 74 | 75 | async def test_upload_with_timeout(self, monkeypatch: MonkeyPatch) -> None: 76 | mocked_upload = AsyncMock(return_value=None) 77 | 78 | monkeypatch.setattr(FilesService, "upload", mocked_upload) 79 | await upload(paths=[Path("./tests/data/upload_folder")], timeout_s=123) 80 | 81 | mocked_upload.assert_called_once_with( 82 | workspace_name=DEFAULT_WORKSPACE_NAME, 83 | paths=[Path("./tests/data/upload_folder")], 84 | write_mode=WriteMode.KEEP, 85 | blocking=True, 86 | timeout_s=123, 87 | show_progress=True, 88 | recursive=False, 89 | desired_file_types=None, 90 | enable_parallel_processing=False, 91 | ) 92 | 93 | async def test_upload_texts(self, monkeypatch: MonkeyPatch) -> None: 94 | mocked_upload_texts = AsyncMock(return_value=None) 95 | monkeypatch.setattr(FilesService, "upload_in_memory", mocked_upload_texts) 96 | files = [ 97 | DeepsetCloudFile( 98 | name="test_file.txt", 99 | text="test content", 100 | meta={"test": "test"}, 101 | ) 102 | ] 103 | await upload_texts(files=files) 104 | 105 | mocked_upload_texts.assert_called_once_with( 106 | workspace_name=DEFAULT_WORKSPACE_NAME, 107 | files=files, 108 | write_mode=WriteMode.KEEP, 109 | blocking=True, 110 | timeout_s=None, 111 | show_progress=True, 112 | enable_parallel_processing=False, 113 | ) 114 | 115 | 116 | @pytest.mark.asyncio 117 | class TestDownloadFiles: 118 | async def test_download_files(self, monkeypatch: MonkeyPatch) -> None: 119 | mocked_download = AsyncMock(return_value=None) 120 | monkeypatch.setattr(FilesService, "download", mocked_download) 121 | await download( 122 | workspace_name="my_workspace", 123 | name="test_file.txt", 124 | odata_filter="test", 125 | batch_size=100, 126 | timeout_s=100, 127 | ) 128 | mocked_download.assert_called_once_with( 129 | workspace_name="my_workspace", 130 | file_dir=None, 131 | name="test_file.txt", 132 | odata_filter="test", 133 | include_meta=True, 134 | batch_size=100, 135 | show_progress=True, 136 | timeout_s=100, 137 | ) 138 | 139 | 140 | 
@pytest.mark.asyncio 141 | class TestListFiles: 142 | async def test_list_files(self, monkeypatch: MonkeyPatch) -> None: 143 | async def mocked_list_all( 144 | self: Any, 145 | *args: Any, 146 | **kwargs: Any, 147 | ) -> AsyncGenerator[List[File], None]: 148 | yield [ 149 | File( 150 | file_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 151 | url="/api/v1/workspaces/search tests/files/cd16435f-f6eb-423f-bf6f-994dc8a36a10", 152 | name="silly_things_1.txt", 153 | size=611, 154 | meta={}, 155 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 156 | ) 157 | ] 158 | 159 | monkeypatch.setattr(FilesService, "list_all", mocked_list_all) 160 | async for file_batch in list_files( 161 | workspace_name="my_workspace", 162 | name="test_file.txt", 163 | odata_filter="test", 164 | batch_size=100, 165 | timeout_s=100, 166 | ): 167 | assert file_batch == [ 168 | File( 169 | file_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 170 | url="/api/v1/workspaces/search tests/files/cd16435f-f6eb-423f-bf6f-994dc8a36a10", 171 | name="silly_things_1.txt", 172 | size=611, 173 | meta={}, 174 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 175 | ) 176 | ] 177 | 178 | async def test_list_files_silence_exit(self, monkeypatch: MonkeyPatch) -> None: 179 | async def mocked_list_all( 180 | self: Any, 181 | *args: Any, 182 | **kwargs: Any, 183 | ) -> AsyncGenerator[List[File], None]: 184 | raise AsyncLibraryNotFoundError() 185 | yield [] # for some reason monkeypatch requires to have the yield statement 186 | 187 | monkeypatch.setattr(FilesService, "list_all", mocked_list_all) 188 | async for file_batch in list_files( 189 | workspace_name="my_workspace", 190 | name="test_file.txt", 191 | odata_filter="test", 192 | batch_size=100, 193 | timeout_s=100, 194 | ): 195 | pass 196 | 197 | 198 | @pytest.mark.asyncio 199 | class TestListUploadSessions: 200 | async def test_list_upload_sessions(self, monkeypatch: MonkeyPatch) -> None: 201 | async def mocked_list_upload_sessions( 202 | self: Any, 203 | *args: Any, 204 | **kwargs: Any, 205 | ) -> AsyncGenerator[List[UploadSessionDetail], None]: 206 | yield [ 207 | UploadSessionDetail( 208 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 209 | created_by=UserInfo( 210 | user_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 211 | given_name="Fake", 212 | family_name="User", 213 | ), 214 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 215 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:10:00.634653+00:00"), 216 | write_mode=UploadSessionWriteModeEnum.KEEP, 217 | status=UploadSessionStatusEnum.CLOSED, 218 | ) 219 | ] 220 | 221 | monkeypatch.setattr(FilesService, "list_upload_sessions", mocked_list_upload_sessions) 222 | async for upload_session_batch in list_upload_sessions( 223 | workspace_name="my_workspace", 224 | is_expired=False, 225 | batch_size=100, 226 | timeout_s=100, 227 | ): 228 | assert upload_session_batch == [ 229 | UploadSessionDetail( 230 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 231 | created_by=UserInfo( 232 | user_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 233 | given_name="Fake", 234 | family_name="User", 235 | ), 236 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 237 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:10:00.634653+00:00"), 238 | write_mode=UploadSessionWriteModeEnum.KEEP, 239 | status=UploadSessionStatusEnum.CLOSED, 240 | ) 241 | ] 242 | 243 | async def 
test_list_files_silence_exit(self, monkeypatch: MonkeyPatch) -> None: 244 | async def mocked_list_upload_sessions( 245 | self: Any, 246 | *args: Any, 247 | **kwargs: Any, 248 | ) -> AsyncGenerator[List[File], None]: 249 | raise AsyncLibraryNotFoundError() 250 | yield [] # for some reason monkeypatch requires to have the yield statement 251 | 252 | monkeypatch.setattr(FilesService, "list_upload_sessions", mocked_list_upload_sessions) 253 | async for _ in list_upload_sessions( 254 | workspace_name="my_workspace", 255 | batch_size=100, 256 | timeout_s=100, 257 | ): 258 | pass 259 | 260 | 261 | @pytest.mark.asyncio 262 | class TestGetUploadSessionStatus: 263 | async def test_get_upload_session(self, monkeypatch: MonkeyPatch) -> None: 264 | mocked_upload_session = UploadSessionStatus( 265 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 266 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 267 | documentation_url="https://docs.deepset.ai", 268 | ingestion_status=UploadSessionIngestionStatus( 269 | failed_files=0, 270 | finished_files=1, 271 | ), 272 | ) 273 | 274 | async def mocked_get_upload_session( 275 | self: Any, 276 | *args: Any, 277 | **kwargs: Any, 278 | ) -> UploadSessionStatus: 279 | return mocked_upload_session 280 | 281 | monkeypatch.setattr(FilesService, "get_upload_session", mocked_get_upload_session) 282 | returned_upload_session = await get_upload_session(session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10")) 283 | assert returned_upload_session == mocked_upload_session 284 | -------------------------------------------------------------------------------- /tests/unit/workflows/sync_client/test_sync_workflow_files.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pathlib import Path 3 | from typing import Any, AsyncGenerator, List 4 | from unittest.mock import AsyncMock, patch 5 | from uuid import UUID 6 | 7 | from deepset_cloud_sdk._api.config import DEFAULT_WORKSPACE_NAME 8 | from deepset_cloud_sdk._api.files import File 9 | from deepset_cloud_sdk._api.upload_sessions import ( 10 | UploadSessionDetail, 11 | UploadSessionIngestionStatus, 12 | UploadSessionStatus, 13 | UploadSessionStatusEnum, 14 | UploadSessionWriteModeEnum, 15 | WriteMode, 16 | ) 17 | from deepset_cloud_sdk.models import DeepsetCloudFile, UserInfo 18 | from deepset_cloud_sdk.workflows.sync_client.files import ( 19 | download, 20 | get_upload_session, 21 | list_files, 22 | list_upload_sessions, 23 | upload, 24 | upload_texts, 25 | ) 26 | 27 | 28 | @patch("deepset_cloud_sdk.workflows.sync_client.files.async_upload") 29 | def test_upload_folder(async_upload_mock: AsyncMock) -> None: 30 | upload(paths=[Path("./tests/data/upload_folder")], enable_parallel_processing=True) 31 | async_upload_mock.assert_called_once_with( 32 | paths=[Path("./tests/data/upload_folder")], 33 | api_key=None, 34 | api_url=None, 35 | workspace_name=DEFAULT_WORKSPACE_NAME, 36 | write_mode=WriteMode.KEEP, 37 | blocking=True, 38 | timeout_s=None, 39 | show_progress=True, 40 | recursive=False, 41 | desired_file_types=None, 42 | enable_parallel_processing=True, 43 | safe_mode=False, 44 | ) 45 | 46 | 47 | @patch("deepset_cloud_sdk.workflows.sync_client.files.async_upload") 48 | def test_upload_folder_safe_mode(async_upload_mock: AsyncMock) -> None: 49 | upload(paths=[Path("./tests/data/upload_folder")], enable_parallel_processing=True, safe_mode=True) 50 | async_upload_mock.assert_called_once_with( 51 | 
paths=[Path("./tests/data/upload_folder")], 52 | api_key=None, 53 | api_url=None, 54 | workspace_name=DEFAULT_WORKSPACE_NAME, 55 | write_mode=WriteMode.KEEP, 56 | blocking=True, 57 | timeout_s=None, 58 | show_progress=True, 59 | recursive=False, 60 | desired_file_types=None, 61 | enable_parallel_processing=True, 62 | safe_mode=True, 63 | ) 64 | 65 | 66 | @patch("deepset_cloud_sdk.workflows.sync_client.files.async_upload_texts") 67 | def test_upload_texts(async_upload_texts_mock: AsyncMock) -> None: 68 | files = [ 69 | DeepsetCloudFile( 70 | name="test_file.txt", 71 | text="test content", 72 | meta={"test": "test"}, 73 | ) 74 | ] 75 | upload_texts(files=files, enable_parallel_processing=True) 76 | async_upload_texts_mock.assert_called_once_with( 77 | files=files, 78 | api_key=None, 79 | api_url=None, 80 | workspace_name=DEFAULT_WORKSPACE_NAME, 81 | write_mode=WriteMode.KEEP, 82 | blocking=True, 83 | timeout_s=None, 84 | show_progress=True, 85 | enable_parallel_processing=True, 86 | ) 87 | 88 | 89 | @patch("deepset_cloud_sdk.workflows.sync_client.files.async_upload_texts") 90 | def test_upload_texts_with_timeout(async_upload_texts_mock: AsyncMock) -> None: 91 | files = [ 92 | DeepsetCloudFile( 93 | name="test_file.txt", 94 | text="test content", 95 | meta={"test": "test"}, 96 | ) 97 | ] 98 | upload_texts(files=files, timeout_s=123) 99 | async_upload_texts_mock.assert_called_once_with( 100 | files=files, 101 | api_key=None, 102 | api_url=None, 103 | workspace_name=DEFAULT_WORKSPACE_NAME, 104 | write_mode=WriteMode.KEEP, 105 | blocking=True, 106 | timeout_s=123, 107 | show_progress=True, 108 | enable_parallel_processing=False, 109 | ) 110 | 111 | 112 | def test_list_files() -> None: 113 | async def mocked_async_list_files(*args: Any, **kwargs: Any) -> AsyncGenerator[List[File], None]: 114 | yield [ 115 | File( 116 | file_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 117 | url="/api/v1/workspaces/search tests/files/cd16435f-f6eb-423f-bf6f-994dc8a36a10", 118 | name="silly_things_1.txt", 119 | size=611, 120 | meta={}, 121 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 122 | ) 123 | ] 124 | 125 | with patch("deepset_cloud_sdk.workflows.sync_client.files.async_list_files", new=mocked_async_list_files): 126 | returned_files = list( 127 | list_files( 128 | workspace_name="my_workspace", 129 | name="test_file.txt", 130 | odata_filter="test", 131 | batch_size=100, 132 | timeout_s=100, 133 | ) 134 | ) 135 | assert len(returned_files) == 1 136 | assert returned_files[0] == [ 137 | File( 138 | file_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 139 | url="/api/v1/workspaces/search tests/files/cd16435f-f6eb-423f-bf6f-994dc8a36a10", 140 | name="silly_things_1.txt", 141 | size=611, 142 | meta={}, 143 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 144 | ) 145 | ] 146 | 147 | 148 | def test_download_files() -> None: 149 | mocked_async_download = AsyncMock() 150 | with patch("deepset_cloud_sdk.workflows.sync_client.files.async_download", new=mocked_async_download): 151 | download( 152 | workspace_name="my_workspace", 153 | name="test_file.txt", 154 | odata_filter="test", 155 | batch_size=100, 156 | timeout_s=100, 157 | ) 158 | mocked_async_download.assert_called_once_with( 159 | api_key=None, 160 | api_url=None, 161 | workspace_name="my_workspace", 162 | name="test_file.txt", 163 | odata_filter="test", 164 | file_dir=None, 165 | include_meta=True, 166 | batch_size=100, 167 | show_progress=True, 168 | timeout_s=100, 169 | safe_mode=False, 
170 | ) 171 | 172 | 173 | def test_list_upload_sessions() -> None: 174 | async def mocked_async_upload_sessions( 175 | *args: Any, **kwargs: Any 176 | ) -> AsyncGenerator[List[UploadSessionDetail], None]: 177 | yield [ 178 | UploadSessionDetail( 179 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 180 | created_by=UserInfo( 181 | user_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 182 | given_name="Fake", 183 | family_name="User", 184 | ), 185 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 186 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:10:00.634653+00:00"), 187 | write_mode=UploadSessionWriteModeEnum.KEEP, 188 | status=UploadSessionStatusEnum.CLOSED, 189 | ) 190 | ] 191 | 192 | with patch( 193 | "deepset_cloud_sdk.workflows.sync_client.files.async_list_upload_sessions", new=mocked_async_upload_sessions 194 | ): 195 | returned_files = list( 196 | list_upload_sessions( 197 | workspace_name="my_workspace", 198 | is_expired=True, 199 | batch_size=100, 200 | timeout_s=100, 201 | ) 202 | ) 203 | assert len(returned_files) == 1 204 | assert returned_files[0] == [ 205 | UploadSessionDetail( 206 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 207 | created_by=UserInfo( 208 | user_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 209 | given_name="Fake", 210 | family_name="User", 211 | ), 212 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 213 | created_at=datetime.datetime.fromisoformat("2022-06-21T16:10:00.634653+00:00"), 214 | write_mode=UploadSessionWriteModeEnum.KEEP, 215 | status=UploadSessionStatusEnum.CLOSED, 216 | ) 217 | ] 218 | 219 | 220 | def test_get_upload_session() -> None: 221 | existing_upload_session = UploadSessionStatus( 222 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 223 | expires_at=datetime.datetime.fromisoformat("2022-06-21T16:40:00.634653+00:00"), 224 | documentation_url="https://docs.deepset.ai", 225 | ingestion_status=UploadSessionIngestionStatus( 226 | failed_files=0, 227 | finished_files=1, 228 | ), 229 | ) 230 | 231 | async def mocked_async_get_upload_session(*args: Any, **kwargs: Any) -> UploadSessionStatus: 232 | return existing_upload_session 233 | 234 | with patch( 235 | "deepset_cloud_sdk.workflows.sync_client.files.async_get_upload_session", new=mocked_async_get_upload_session 236 | ): 237 | returned_upload_session = get_upload_session( 238 | workspace_name="my_workspace", 239 | session_id=UUID("cd16435f-f6eb-423f-bf6f-994dc8a36a10"), 240 | ) 241 | assert returned_upload_session == existing_upload_session 242 | -------------------------------------------------------------------------------- /tests/unit/workflows/sync_client/test_utils.py: -------------------------------------------------------------------------------- 1 | from asyncio import AbstractEventLoop 2 | from typing import AsyncIterator 3 | 4 | from deepset_cloud_sdk.workflows.sync_client.utils import iter_over_async 5 | 6 | 7 | def test_iter_over_async(event_loop: AbstractEventLoop) -> None: 8 | async def async_generator() -> AsyncIterator[int]: 9 | yield 1 10 | yield 2 11 | yield 3 12 | 13 | sync_generator = iter_over_async(async_generator(), event_loop) 14 | assert list(sync_generator) == [1, 2, 3] 15 | --------------------------------------------------------------------------------
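Note: the final test above exercises iter_over_async from deepset_cloud_sdk.workflows.sync_client.utils, whose source is not included in this dump. As a rough illustration only — a minimal sketch of how such a helper could behave, not the SDK's actual implementation (the sentinel-based loop and the exact signature are assumptions) — it might look roughly like this in Python:

import asyncio
from typing import AsyncIterator, Iterator, TypeVar

T = TypeVar("T")

_SENTINEL = object()


def iter_over_async(async_iter: AsyncIterator[T], loop: asyncio.AbstractEventLoop) -> Iterator[T]:
    """Hypothetical sketch: drive an async iterator on `loop`, yielding its items synchronously."""

    async def _next() -> object:
        # Fetch the next item, or return the sentinel once the async iterator is exhausted.
        try:
            return await async_iter.__anext__()
        except StopAsyncIteration:
            return _SENTINEL

    while True:
        item = loop.run_until_complete(_next())
        if item is _SENTINEL:
            break
        yield item

Used as in the test above: pass an async generator and an event loop, then consume the result like any ordinary synchronous generator (for example with list()).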