├── img ├── wcd-create-cluster-1.jpg ├── wcd-create-cluster-2.jpg └── wcd-enable-async-indexing.png ├── .env ├── 4-multi-tenancy ├── dl_data │ ├── 2212.10496.pdf │ └── 2401.00107.pdf ├── 3a-generate-data.ipynb ├── 2-setup-run.ipynb ├── 4-search-tenants.ipynb ├── 3b-load-data.ipynb └── 1-playground-run.ipynb ├── .claude └── settings.local.json ├── .gitignore ├── install.md ├── _docker ├── docker-compose-ollama.yml ├── docker-compose-ollama-codespace.yml ├── docker-compose.yml └── docker-compose-clip.yml ├── prep-data.py ├── .devcontainer └── devcontainer.json ├── 1-intro ├── jeopardy_tiny.json ├── 0-prep-run.ipynb ├── complete │ ├── 2-query-complete.ipynb │ └── 1-load-data-complete.ipynb ├── 2-query.ipynb └── 1-load-data.ipynb ├── README.md ├── 5-vector-compression ├── data_loader.py ├── 2-search-run.ipynb ├── 1-rq-run.ipynb ├── 1-sq-run.ipynb ├── 1-bq-run.ipynb ├── 1-pq-run.ipynb └── 0-vector-indexes.ipynb ├── prep-data.ipynb ├── 2-pre-vectorised-data ├── 3-wiki-search-run.ipynb ├── 2-wiki-import.ipynb ├── complete │ └── 2-wiki-import-complete.ipynb └── 1-playground-run.ipynb ├── requirements.txt └── 3-rag ├── 2-rag-gen-query-run.ipynb ├── complete └── 1-rag-complete.ipynb └── 1-rag.ipynb /img/wcd-create-cluster-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-1.jpg -------------------------------------------------------------------------------- /img/wcd-create-cluster-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-2.jpg -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # WEAVIATE_URL=your_weaviate_url_here 2 | # WEAVIATE_KEY=your_weaviate_key_here 3 | # OPENAI_API_KEY= 
your_openai_api_key 4 | 5 | -------------------------------------------------------------------------------- /img/wcd-enable-async-indexing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-enable-async-indexing.png -------------------------------------------------------------------------------- /4-multi-tenancy/dl_data/2212.10496.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2212.10496.pdf -------------------------------------------------------------------------------- /4-multi-tenancy/dl_data/2401.00107.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2401.00107.pdf -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "mcp__ide__executeCode", 5 | "Bash(pip show:*)" 6 | ], 7 | "deny": [], 8 | "ask": [] 9 | } 10 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv* 2 | __pycache__/ 3 | 4 | # distill files 5 | # **/dl_data/ 6 | .DS_Store 7 | 8 | 9 | # ignore temp files/folders with names starting with __ 10 | __* 11 | 12 | # ignore big files 13 | *.parquet 14 | wiki-data -------------------------------------------------------------------------------- /install.md: -------------------------------------------------------------------------------- 1 | ## How to setup the python environment with venv 2 | To run the project locally, it is best to setup python 
environment with venv. 3 | 4 | ### Setup – do this only once 5 | First create a new venv configuration. 6 | ``` 7 | python3 -m venv .venv 8 | ``` 9 | 10 | Then switch to the new configuration: 11 | ``` 12 | source .venv/bin/activate 13 | ``` 14 | 15 | And install the required packages. 16 | ``` 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ### How to use after 21 | 22 | **Activate** 23 | If in the future, you need to switch to the venv setup, just call: 24 | ``` 25 | source .venv/bin/activate 26 | ``` 27 | 28 | **Deactivate** 29 | To disconnect from the venv environment, call: 30 | ``` 31 | deactivate 32 | ``` -------------------------------------------------------------------------------- /_docker/docker-compose-ollama.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: cr.weaviate.io/semitechnologies/weaviate:1.26.4 13 | ports: 14 | - 8080:8080 15 | - 50051:50051 16 | volumes: 17 | - weaviate_data:/var/lib/weaviate 18 | restart: on-failure:0 19 | environment: 20 | QUERY_DEFAULTS_LIMIT: 25 21 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 22 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 23 | DEFAULT_VECTORIZER_MODULE: 'none' 24 | ENABLE_API_BASED_MODULES: true 25 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama' 26 | CLUSTER_HOSTNAME: 'node1' 27 | volumes: 28 | weaviate_data: 29 | ... 
-------------------------------------------------------------------------------- /_docker/docker-compose-ollama-codespace.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: cr.weaviate.io/semitechnologies/weaviate:1.26.4 13 | ports: 14 | - 8080:8080 15 | - 50051:50051 16 | volumes: 17 | - weaviate_data:/var/lib/weaviate 18 | restart: on-failure:0 19 | environment: 20 | QUERY_DEFAULTS_LIMIT: 25 21 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 22 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 23 | DEFAULT_VECTORIZER_MODULE: 'none' 24 | ENABLE_API_BASED_MODULES: true 25 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama' 26 | CLUSTER_HOSTNAME: 'node1' 27 | ollama: # map to locally run ollama models 28 | image: ollama/ollama:0.2.5 29 | volumes: 30 | - /root/.ollama:/root/.ollama 31 | volumes: 32 | weaviate_data: 33 | ... 
-------------------------------------------------------------------------------- /_docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: semitechnologies/weaviate:1.32.9 13 | ports: 14 | - 8080:8080 15 | - 50051:50051 16 | volumes: 17 | - weaviate_data:/var/lib/weaviate 18 | restart: on-failure:0 19 | environment: 20 | QUERY_DEFAULTS_LIMIT: 25 21 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 22 | ENABLE_API_BASED_MODULES: 'true' 23 | CLUSTER_HOSTNAME: 'node1' 24 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'false' 25 | AUTHENTICATION_DB_USERS_ENABLED: 'true' 26 | AUTHENTICATION_APIKEY_ENABLED: 'true' 27 | AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'root-user-key' 28 | AUTHENTICATION_APIKEY_USERS: 'root-user' 29 | AUTHORIZATION_ENABLE_RBAC: 'true' 30 | AUTHORIZATION_RBAC_ROOT_USERS: 'root-user' 31 | DEFAULT_VECTORIZER_MODULE: 'none' 32 | ENABLE_MODULES: '' 33 | volumes: 34 | weaviate_data: 35 | ... 
-------------------------------------------------------------------------------- /_docker/docker-compose-clip.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: cr.weaviate.io/semitechnologies/weaviate:1.30.0 13 | ports: 14 | - 8080:8080 15 | - 50051:50051 16 | volumes: 17 | - weaviate_data:/var/lib/weaviate 18 | restart: on-failure:0 19 | environment: 20 | CLIP_INFERENCE_API: 'http://multi2vec-clip:8080' 21 | QUERY_DEFAULTS_LIMIT: 25 22 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 23 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 24 | DEFAULT_VECTORIZER_MODULE: 'none' 25 | ENABLE_API_BASED_MODULES: true 26 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama,multi2vec-clip' 27 | CLUSTER_HOSTNAME: 'node1' 28 | multi2vec-clip: 29 | image: cr.weaviate.io/semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1 30 | environment: 31 | ENABLE_CUDA: '0' 32 | ollama: 33 | image: ollama/ollama:0.2.5 34 | # volumes: 35 | # - /root/.ollama:/root/.ollama 36 | volumes: 37 | weaviate_data: 38 | ... 
"""Download a sample of the pre-vectorized wiki data from the Hugging Face Hub.

Run as a script (``python3 prep-data.py``) to fetch the first 10 files of the
``weaviate/snowflake-arctic-v2`` dataset into the local ``wiki-data`` folder.
"""
from huggingface_hub import list_repo_files, hf_hub_download

# Hugging Face dataset repository every helper below talks to.
REPO_ID = "weaviate/wiki-sample"
# Local folder the downloaded files are written to.
LOCAL_DIR = "wiki-data"


def list_wiki_datasets():
    """Return the dataset names found in the repository.

    Only paths ending in ``0001.parquet`` are kept so that each dataset is
    listed once, then the ``/0001.parquet`` part is stripped from the path.
    """
    all_files = list_repo_files(REPO_ID, repo_type="dataset")

    # One "0001.parquet" entry per dataset -> no duplicates.
    first_files = [path for path in all_files if path.endswith("0001.parquet")]

    return [path.replace("/0001.parquet", "") for path in first_files]


def list_dataset_files(dataset):
    """Return all repository file paths that start with ``dataset``."""
    repo_files = list_repo_files(REPO_ID, repo_type="dataset")

    return [path for path in repo_files if path.startswith(dataset)]


def download_file(file):
    """Download a single repository file into ``LOCAL_DIR``."""
    hf_hub_download(
        repo_id=REPO_ID,
        filename=file,
        repo_type="dataset",
        local_dir=LOCAL_DIR,
    )


def download_source_files(dataset="no-vectors", max_files=1000):
    """Download up to ``max_files`` files that belong to ``dataset``.

    Args:
        dataset: Path prefix of the dataset inside the repository.
        max_files: Maximum number of files to download. ``None`` means no
            limit; zero or a negative number downloads nothing.
    """
    files_to_download = list_dataset_files(dataset)

    # Slice up front instead of counting down inside the loop: the old
    # countdown never hit 0 for max_files <= 0 and downloaded everything.
    if max_files is not None:
        files_to_download = files_to_download[:max(max_files, 0)]

    for file in files_to_download:
        print(f"Downloading {file}")
        download_file(file)


# Guarded so that importing this module does not start a download.
if __name__ == "__main__":
    download_source_files("weaviate/snowflake-arctic-v2", 10)
For config options, see the README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/python-3 3 | { 4 | "name": "Weaviate Playground", 5 | "image": "mcr.microsoft.com/devcontainers/python:3.12-bullseye", 6 | "features": { 7 | "ghcr.io/devcontainers/features/docker-in-docker:2.9": {} 8 | }, 9 | // Configure tool-specific properties. 10 | "customizations": { 11 | // Configure properties specific to VS Code. 12 | "vscode": { 13 | // Set *default* container specific settings.json values on container create. 14 | "settings": { 15 | "python.defaultInterpreterPath": "/usr/local/bin/python", 16 | "files.exclude": { 17 | "__pycache__": true 18 | } 19 | }, 20 | 21 | // Add the IDs of extensions you want installed when the container is created. 22 | "extensions": [ 23 | "ms-azuretools.vscode-docker", 24 | "ms-python.python", 25 | "ms-toolsai.jupyter" 26 | ] 27 | } 28 | }, 29 | 30 | // Use 'postStartCommand' to run commands after the container is started (more frequently than create). 31 | "postStartCommand": "pip3 install --user -r requirements.txt && python3 prep-data.py", 32 | 33 | "hostRequirements": { 34 | "memory": "16gb", 35 | "cpus": 4 36 | }, 37 | 38 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
39 | "remoteUser": "vscode" 40 | } -------------------------------------------------------------------------------- /1-intro/jeopardy_tiny.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Category": "SCIENCE", 4 | "Question": "This organ removes excess glucose from the blood & stores it as glycogen", 5 | "Answer": "Liver" 6 | }, 7 | { 8 | "Category": "ANIMALS", 9 | "Question": "It's the only living mammal in the order Proboseidea", 10 | "Answer": "Elephant" 11 | }, 12 | { 13 | "Category": "ANIMALS", 14 | "Question": "The gavial looks very much like a crocodile except for this bodily feature", 15 | "Answer": "the nose or snout" 16 | }, 17 | { 18 | "Category": "ANIMALS", 19 | "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa", 20 | "Answer": "Antelope" 21 | }, 22 | { 23 | "Category": "ANIMALS", 24 | "Question": "Heaviest of all poisonous snakes is this North American rattlesnake", 25 | "Answer": "the diamondback rattler" 26 | }, 27 | { 28 | "Category": "SCIENCE", 29 | "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification", 30 | "Answer": "species" 31 | }, 32 | { 33 | "Category": "SCIENCE", 34 | "Question": "A metal that is ductile can be pulled into this while cold & under pressure", 35 | "Answer": "wire" 36 | }, 37 | { 38 | "Category": "SCIENCE", 39 | "Question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance", 40 | "Answer": "DNA" 41 | }, 42 | { 43 | "Category": "SCIENCE", 44 | "Question": "Changes in the tropospheric layer of this are what gives us weather", 45 | "Answer": "the atmosphere" 46 | }, 47 | { 48 | "Category": "SCIENCE", 49 | "Question": "In 70-degree air, a plane traveling at about 1,130 feet per second breaks it", 50 | "Answer": "Sound barrier" 51 | } 52 | ] 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weaviate Workshop 2 | 3 | ## What you need for the workshop 4 | 5 | * API Keys for embedding models, like: 6 | * OpenAI - [API keys](https://platform.openai.com/settings/profile?tab=api-keys) 7 | * etc. 8 | 9 | ## Create a Weaviate Cloud instance 10 | 11 | * Head to [Weaviate Cloud console](https://console.weaviate.cloud/) and log in, or create a new account. 12 | * Create a free `Sandbox` cluster. Give it a name, select the cloud region and press "Create". 13 | 14 | ![wcd create cluster - step 1](img/wcd-create-cluster-1.jpg) 15 | ![wcd create cluster - step 2](img/wcd-create-cluster-2.jpg) 16 | 17 | ## Running the workshop 18 | 19 | ### Option 1 - Run locally 20 | 21 | #### Virtual environment – do this only once 22 | First create a new venv configuration. 23 | ``` 24 | python3 -m venv .venv 25 | ``` 26 | 27 | Then switch to the new configuration: 28 | ``` 29 | source .venv/bin/activate 30 | ``` 31 | 32 | And install the required packages. 33 | ``` 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | ### Option 2 - GitHub Codespaces instructions 38 | 39 | 1. Go to the project [https://github.com/weaviate-tutorials/weaviate-workshop](https://github.com/weaviate-tutorials/weaviate-workshop) 40 | 41 | Make sure you are logged in with GitHub. 42 | 43 | 2. Create a Codespace project 44 | * Press the green `<> Code` button, then switch to `Codespaces` tab. 45 | * Press the `Create codespace on main` button. 46 | * Your codespace project will install all the necessary components; this will take a few minutes. 47 | 48 | 49 | ## Env vars 50 | 51 | Update env vars in .env. 52 | 53 | Hint: you can find your Weaviate Cluster URL and API keys in the [WCD console](https://console.weaviate.cloud/). 
"""Helpers that stream the local wiki parquet files and import them into Weaviate."""
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm
from weaviate.util import generate_uuid5

# Relative glob, so it resolves against the current working directory.
DEFAULT_DATA_GLOB = '../wiki-data/weaviate/snowflake-arctic-v2/*.parquet'


def prepare_dataset(data_glob=DEFAULT_DATA_GLOB):
    """Open the parquet files matching ``data_glob`` as a streaming dataset.

    Args:
        data_glob: Glob pattern of the parquet files to load as the
            ``train`` split.

    Returns:
        The streaming dataset returned by ``load_dataset``.
    """
    dt = load_dataset('parquet', data_files={'train': [data_glob]}, split="train", streaming=True)

    print(f"Loaded Dataset: '{dt.info.dataset_name}' - Config: '{dt.info.config_name}'")

    return dt


def test_dataset(limit=10):
    """Print the first ``limit`` items of the dataset as a quick sanity check.

    A ``limit`` of zero or less prints nothing.
    """
    for item in islice(prepare_dataset(), max(limit, 0)):
        print(item)


def import_wiki_data(client, collection_name, max_rows=20_000):
    """Import up to ``max_rows`` pre-vectorized wiki items into a collection.

    Args:
        client: Connected Weaviate client (uses ``client.collections``).
        collection_name: Name of an existing collection to import into.
        max_rows: Maximum number of items to submit; zero or less submits
            nothing.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not client.collections.exists(collection_name):
        print(f"Error: Collection {collection_name} doesn't exist")
        return

    print(f"Importing {max_rows} data items")

    dataset = prepare_dataset()
    wiki = client.collections.use(collection_name)

    # Number of objects handed to the batch (not the number that succeeded).
    counter = 0

    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
        # islice enforces the limit before anything is added, so max_rows=0
        # no longer imports a single stray object.
        for item in tqdm(islice(dataset, max(max_rows, 0)), total=max_rows):
            batch.add_object(
                properties={
                    "wiki_id": item["wiki_id"],
                    "text": item["text"],
                    "title": item["title"],
                    "url": item["url"],
                },
                # Id is derived from wiki_id via generate_uuid5.
                uuid=generate_uuid5(item["wiki_id"]),
                # Vector is stored under the "main_vector" name.
                vector={"main_vector": item["vector"]},
            )
            # Count right after adding so the total stays accurate even when
            # the error check below stops the loop.
            counter += 1

            # Give up early once more than 10 errors have been reported.
            if batch.number_errors > 10:
                print(f"Reached {batch.number_errors} Errors during batch import")
                break

    # check for errors at the end
    failed = wiki.batch.failed_objects
    if len(failed) > 0:
        print("Final error check")
        print(f"Some errors {len(failed)}")
        print(failed[-1])

    print(f"Imported {counter} items")
    print("-----------------------------------")
" return list(filter(lambda path: path.startswith(dataset), dataset_files))\n", 24 | "\n", 25 | "def download_file(file):\n", 26 | " hf_hub_download(\n", 27 | " repo_id=\"weaviate/wiki-sample\",\n", 28 | " filename=file,\n", 29 | " repo_type=\"dataset\",\n", 30 | " local_dir=\"wiki-data\",\n", 31 | " )\n", 32 | "\n", 33 | "def download_source_files(dataset=\"no-vectors\", max_files=1000):\n", 34 | " files_to_download = list_dataset_files(dataset)\n", 35 | " print(f\"Files to download: {files_to_download}\")\n", 36 | "\n", 37 | " for file in files_to_download:\n", 38 | " print(f\"Downloading {file}\")\n", 39 | " download_file(file)\n", 40 | "\n", 41 | " max_files -= 1\n", 42 | " if(max_files == 0): break" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "list_wiki_datasets()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "download_source_files(\"weaviate/snowflake-arctic-v2\", 10)" 61 | ] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": ".venv (3.11.9)", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.11.9" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /2-pre-vectorised-data/3-wiki-search-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Query Data - show it works\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | 
"cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import weaviate\n", 37 | "from weaviate.classes.init import Auth\n", 38 | "\n", 39 | "client = weaviate.connect_to_weaviate_cloud(\n", 40 | " cluster_url=WEAVIATE_URL,\n", 41 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 42 | ")\n", 43 | "\n", 44 | "client.is_ready()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Vector search" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "wiki = client.collections.use(\"Wiki\")\n", 61 | "\n", 62 | "response = wiki.query.near_text(\n", 63 | " query=\"musical instruments\",\n", 64 | " limit=5\n", 65 | ")\n", 66 | "\n", 67 | "for item in response.objects:\n", 68 | " print(item.properties)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Close the client" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "client.close()" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": ".venv (3.11.9)", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | 
"pygments_lexer": "ipython3", 104 | "version": "3.11.9" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 2 109 | } 110 | -------------------------------------------------------------------------------- /4-multi-tenancy/3a-generate-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multi-tenant Chat with Papers - Reading data from papers\n", 8 | "\n", 9 | "### Helper function to load content from arxiv papers - `from_arxiv_paper`" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "from distyll.text import from_arxiv_paper\n", 20 | "\n", 21 | "paper = from_arxiv_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n", 22 | "\n", 23 | "print(json.dumps(paper, indent=2))" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Helper function to chunk up a very long text - `chunk_text`" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "json.dumps??" 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from distyll.utils import chunk_text\n", 49 | "\n", 50 | "chunks = chunk_text(source_text=paper[\"text\"], token_length=200)\n", 51 | "print(json.dumps(chunks, indent=2))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### Combine read and chunk - `get_chunks_from_paper`" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from distyll.text import from_arxiv_paper\n", 68 | "from distyll.utils import chunk_text\n", 69 | "\n", 70 | "def get_chunks_from_paper(url):\n", 71 | " paper = from_arxiv_paper(url)\n", 72 | " chunks = chunk_text(source_text=paper[\"text\"])\n", 73 | "\n", 74 | " paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n", 75 | " paper[\"chunks\"] = chunks\n", 76 | " return paper" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "#### Test an example" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")" 93 | ] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": ".venv (3.11.9)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.11.9" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 2 117 | } 118 | -------------------------------------------------------------------------------- 
/5-vector-compression/2-search-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "from dotenv import load_dotenv\n", 11 | "\n", 12 | "load_dotenv()\n", 13 | "\n", 14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 16 | "\n", 17 | "print(WEAVIATE_URL[:10])\n", 18 | "print(WEAVIATE_KEY[:10])" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import weaviate\n", 28 | "from weaviate.classes.init import Auth\n", 29 | "\n", 30 | "client = weaviate.connect_to_weaviate_cloud(\n", 31 | " cluster_url=WEAVIATE_URL,\n", 32 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 33 | ")\n", 34 | "\n", 35 | "client.is_ready()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Preview data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from weaviate.classes.query import Filter\n", 52 | "\n", 53 | "wikiQ = client.collections.use(\"WikiQ\")\n", 54 | "\n", 55 | "response = wikiQ.query.fetch_objects(\n", 56 | " filters=Filter.by_property(\"text\").like(\"musical\"),\n", 57 | " limit=5\n", 58 | ")\n", 59 | "\n", 60 | "for item in response.objects:\n", 61 | " print(item.properties[\"wiki_id\"])\n", 62 | " print(item.properties[\"title\"])\n", 63 | " print(item.properties[\"text\"], '\\n')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Vector search" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "wikiQ = client.collections.use(\"WikiQ\")\n", 80 | "\n", 81 | 
"response = wikiQ.query.near_text(\n", 82 | " query=\"musical instruments\",\n", 83 | " limit=5\n", 84 | ")\n", 85 | "\n", 86 | "for item in response.objects:\n", 87 | " print(item.properties)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "> Have fun! Add your own queries." 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Close the client" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "client.close()" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": ".venv (3.11.9)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.11.9" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.5 2 | aiosignal==1.3.1 3 | annotated-types==0.7.0 4 | anyio==4.8.0 5 | appnope==0.1.4 6 | argon2-cffi==23.1.0 7 | argon2-cffi-bindings==21.2.0 8 | arrow==1.3.0 9 | asttokens==3.0.0 10 | async-lru==2.0.4 11 | async-timeout==4.0.3 12 | attrs==23.2.0 13 | Authlib==1.3.1 14 | babel==2.16.0 15 | beautifulsoup4==4.12.3 16 | bleach==6.1.0 17 | boto3==1.34.144 18 | botocore==1.34.144 19 | Brotli==1.1.0 20 | certifi==2025.1.31 21 | cffi==1.17.1 22 | charset-normalizer==3.3.2 23 | cohere==5.6.1 24 | comm==0.2.2 25 | cryptography==44.0.1 26 | datasets==2.20.0 27 | debugpy==1.8.2 28 | decorator==4.4.2 29 | defusedxml==0.7.1 30 | deprecation==2.1.0 31 | 
dill==0.3.8 32 | distro==1.9.0 33 | distyll-info==0.3.1 34 | dotenv==0.9.9 35 | exceptiongroup==1.3.0 36 | executing==2.2.0 37 | fastavro==1.9.5 38 | fastjsonschema==2.20.0 39 | filelock==3.15.4 40 | fqdn==1.5.1 41 | frozenlist==1.4.1 42 | fsspec==2024.5.0 43 | grpcio==1.70.0 44 | grpcio-health-checking==1.70.0 45 | grpcio-tools==1.70.0 46 | h11==0.14.0 47 | httpcore==1.0.7 48 | httpx==0.28.1 49 | httpx-sse==0.4.0 50 | huggingface-hub==0.23.5 51 | idna==3.10 52 | imageio==2.35.1 53 | imageio-ffmpeg==0.5.1 54 | ipykernel==6.29.5 55 | ipython==8.32.0 56 | ipywidgets==8.1.5 57 | isoduration==20.11.0 58 | jedi==0.19.2 59 | Jinja2==3.1.4 60 | jiter==0.8.2 61 | jmespath==1.0.1 62 | json5==0.9.25 63 | jsonpointer==3.0.0 64 | jsonschema==4.23.0 65 | jsonschema-specifications==2023.12.1 66 | jupyter==1.1.1 67 | jupyter-console==6.6.3 68 | jupyter-events==0.10.0 69 | jupyter-lsp==2.2.5 70 | jupyter_client==8.6.2 71 | jupyter_core==5.7.2 72 | jupyter_server==2.14.2 73 | jupyter_server_terminals==0.5.3 74 | jupyterlab==4.2.5 75 | jupyterlab_pygments==0.3.0 76 | jupyterlab_server==2.27.3 77 | jupyterlab_widgets==3.0.13 78 | load-dotenv==0.1.0 79 | MarkupSafe==2.1.5 80 | matplotlib-inline==0.1.7 81 | mistune==3.0.2 82 | moviepy==1.0.3 83 | multidict==6.0.5 84 | multiprocess==0.70.16 85 | mutagen==1.47.0 86 | nbclient==0.10.0 87 | nbconvert==7.16.4 88 | nbformat==5.10.4 89 | nest-asyncio==1.6.0 90 | notebook==7.2.2 91 | notebook_shim==0.2.4 92 | numpy==2.0.0 93 | openai==1.64.0 94 | overrides==7.7.0 95 | packaging==24.1 96 | pandas==2.2.2 97 | pandocfilters==1.5.1 98 | parameterized==0.9.0 99 | parso==0.8.4 100 | pexpect==4.9.0 101 | pillow==10.4.0 102 | platformdirs==4.2.2 103 | proglog==0.1.10 104 | prometheus_client==0.20.0 105 | prompt_toolkit==3.0.50 106 | protobuf==5.29.3 107 | psutil==6.0.0 108 | ptyprocess==0.7.0 109 | pure_eval==0.2.3 110 | pyarrow==17.0.0 111 | pyarrow-hotfix==0.6 112 | pycparser==2.22 113 | pycryptodomex==3.20.0 114 | pydantic==2.10.6 115 | 
pydantic_core==2.27.2 116 | pydub==0.25.1 117 | Pygments==2.19.1 118 | pypdf==4.3.1 119 | python-dateutil==2.9.0.post0 120 | python-dotenv==1.0.1 121 | python-json-logger==2.0.7 122 | pytz==2024.1 123 | PyYAML==6.0.1 124 | pyzmq==26.0.3 125 | referencing==0.35.1 126 | requests==2.32.3 127 | rfc3339-validator==0.1.4 128 | rfc3986-validator==0.1.1 129 | rpds-py==0.20.0 130 | s3transfer==0.10.2 131 | Send2Trash==1.8.3 132 | six==1.16.0 133 | sniffio==1.3.1 134 | soupsieve==2.6 135 | stack-data==0.6.3 136 | terminado==0.18.1 137 | tinycss2==1.3.0 138 | tokenizers==0.19.1 139 | tomli==2.2.1 140 | tornado==6.4.1 141 | tqdm==4.67.1 142 | traitlets==5.14.3 143 | types-python-dateutil==2.9.0.20240821 144 | types-requests==2.32.0.20240712 145 | typing_extensions==4.12.2 146 | tzdata==2024.1 147 | uri-template==1.3.0 148 | urllib3==2.2.2 149 | validators==0.34.0 150 | wcwidth==0.2.13 151 | weaviate-client==4.17.0 152 | webcolors==24.8.0 153 | webencodings==0.5.1 154 | websocket-client==1.8.0 155 | websockets==13.0.1 156 | widgetsnbextension==4.0.13 157 | xxhash==3.4.1 158 | yarl==1.9.4 159 | yt-dlp==2023.12.30 160 | -------------------------------------------------------------------------------- /1-intro/0-prep-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Check if everything is in place" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Weaviate Python Client v4\n", 15 | "> This notebook was created with Weaviate `1.26` and the Weaviate Client `4.7`\n", 16 | "\n", 17 | "Run the below command to check if you run the latest version of the Weaviate Python Client v4." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!pip show weaviate-client" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Get keys and urls\n", 34 | "\n", 35 | "> You can update your env variables in the `.env` file at the root of the project." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import os\n", 45 | "from dotenv import load_dotenv\n", 46 | "\n", 47 | "load_dotenv()\n", 48 | "\n", 49 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 50 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 51 | "\n", 52 | "print(WEAVIATE_URL[:10])\n", 53 | "print(WEAVIATE_KEY[:10])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Connect to Weaviate" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import weaviate\n", 70 | "from weaviate.classes.init import Auth\n", 71 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 72 | "\n", 73 | "client = weaviate.connect_to_weaviate_cloud(\n", 74 | " cluster_url=WEAVIATE_URL,\n", 75 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 76 | "\n", 77 | " # additional_config=AdditionalConfig(\n", 78 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 79 | " # )\n", 80 | ")\n", 81 | "\n", 82 | "client.is_ready()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Display the available modules\n", 90 | "\n", 91 | "> You should be able to see 'generative-openai' and 'text2vec-openai', plus many other modules." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "client.get_meta()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### Close the client\n", 108 | "When you are done with the client, you should close it to release the resources." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "client.close()" 118 | ] 119 | } 120 | ], 121 | "metadata": { 122 | "kernelspec": { 123 | "display_name": ".venv (3.11.9)", 124 | "language": "python", 125 | "name": "python3" 126 | }, 127 | "language_info": { 128 | "codemirror_mode": { 129 | "name": "ipython", 130 | "version": 3 131 | }, 132 | "file_extension": ".py", 133 | "mimetype": "text/x-python", 134 | "name": "python", 135 | "nbconvert_exporter": "python", 136 | "pygments_lexer": "ipython3", 137 | "version": "3.11.9" 138 | } 139 | }, 140 | "nbformat": 4, 141 | "nbformat_minor": 2 142 | } 143 | -------------------------------------------------------------------------------- /4-multi-tenancy/2-setup-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multi-tenant Chat with Papers - Setup\n", 8 | "## Get keys and urls" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import os\n", 18 | "from dotenv import load_dotenv\n", 19 | "\n", 20 | "load_dotenv()\n", 21 | "\n", 22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 23 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 24 | "\n", 25 | "print(WEAVIATE_URL[:10])\n", 26 | "print(WEAVIATE_KEY[:10])" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Connect to Weaviate" 
34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import weaviate\n", 43 | "from weaviate.classes.init import Auth\n", 44 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | "\n", 50 | " # additional_config=AdditionalConfig(\n", 51 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 52 | " # )\n", 53 | ")\n", 54 | "\n", 55 | "client.is_ready()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Create Tenant-ready collection" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from weaviate.classes.config import Configure\n", 72 | "\n", 73 | "if (client.collections.exists(\"Papers\")):\n", 74 | " client.collections.delete(\"Papers\")\n", 75 | "\n", 76 | "client.collections.create(\n", 77 | " \"Papers\",\n", 78 | "\n", 79 | " vector_config=[\n", 80 | " Configure.Vectors.text2vec_weaviate(\n", 81 | " name=\"main_vector\",\n", 82 | "\n", 83 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 84 | " source_properties=[\"chunk\"]\n", 85 | " )\n", 86 | " ],\n", 87 | "\n", 88 | " # supported models: https://weaviate.io/developers/weaviate/model-providers/openai/generative#available-models\n", 89 | " generative_config=Configure.Generative.openai(\n", 90 | " model=\"gpt-4o-mini\" # gpt-4\n", 91 | " ),\n", 92 | "\n", 93 | " multi_tenancy_config=Configure.multi_tenancy(True)\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## List Tenants" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 
110 | "papers = client.collections.use(\"Papers\")\n", 111 | "papers.tenants.get()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Close the client" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "client.close()" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": ".venv (3.11.9)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.11.9" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /5-vector-compression/1-rq-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Compression – Load Data and compress vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import 
weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | ")\n", 50 | "\n", 51 | "client.is_ready()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create Collection with RQ configuration\n", 59 | "\n", 60 | "[Docs: Rotational Quantization (RQ)](https://weaviate.io/developers/weaviate/configuration/compression/rq-compression)\n", 61 | "\n", 62 | "> Note: Rotational Quantization (RQ) does not require a training phase.
\n", 63 | "> RQ begins compressing vectors immediately upon insertion, without waiting for a minimum number of objects or a training step.\n", 64 | ">\n", 65 | "> This makes RQ ideal for applications that need immediate compression and fast setup, as vectors are compressed and searchable as soon as they are added to the collection.
\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from weaviate.classes.config import Configure\n", 75 | "\n", 76 | "client.collections.delete(\"WikiQ\")\n", 77 | "\n", 78 | "client.collections.create(\n", 79 | " name=\"WikiQ\",\n", 80 | "\n", 81 | " vector_config=[\n", 82 | " Configure.Vectors.text2vec_weaviate(\n", 83 | " name=\"main_vector\",\n", 84 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 85 | " source_properties=['title', 'text'],\n", 86 | "\n", 87 | " # Configure RQ\n", 88 | " vector_index_config=Configure.VectorIndex.hnsw(\n", 89 | " quantizer=Configure.VectorIndex.Quantizer.rq(\n", 90 | " rescore_limit=200, # Number of overfetched candidates used for rescoring\n", 91 | " bits=8 # Number of bits (only 8 is supported)\n", 92 | " )\n", 93 | " ),\n", 94 | " )\n", 95 | " ],\n", 96 | ")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## The rest is the same" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from data_loader import import_wiki_data\n", 113 | "import_wiki_data(client, \"WikiQ\", 25000)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "WikiQ = client.collections.get(\"WikiQ\")\n", 123 | "WikiQ.aggregate.over_all()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Clean up" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# client.collections.delete(\"WikiQ\")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Close the client" 147 | ] 148 | }, 149 | { 150 | "cell_type": 
"code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "client.close()" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": ".venv (3.11.9)", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.11.9" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 2 180 | } 181 | -------------------------------------------------------------------------------- /5-vector-compression/1-sq-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Compression – Load Data and compress vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 45 | "\n", 46 | "client = weaviate.connect_to_custom(\n", 47 | " http_host=WEAVIATE_URL,\n", 48 | " http_port=8080,\n", 49 | " http_secure=False,\n", 50 | " 
grpc_host=WEAVIATE_KEY,\n", 51 | " grpc_port=50051,\n", 52 | " grpc_secure=False,\n", 53 | ")\n", 54 | "\n", 55 | "client.is_ready()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Create Collection with SQ configuration\n", 63 | "\n", 64 | "[Docs: Scalar Quantization (SQ)](https://weaviate.io/developers/weaviate/configuration/compression/sq-compression)\n", 65 | "\n", 66 | "> Note: Scalar Quantization includes a training phase, which is required to determine scalar bucket boundaries.
\n", 67 | "> In other words, based on your data, it figures out how to best compress your vectors.\n", 68 | ">\n", 69 | "> The compression training starts when the collection reaches `training_limit` number of objects.
\n", 70 | "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from weaviate.classes.config import Configure\n", 80 | "\n", 81 | "client.collections.delete(\"WikiQ\")\n", 82 | "\n", 83 | "# Create a collection here - with Weaviate as a vectorizer\n", 84 | "client.collections.create(\n", 85 | " name=\"WikiQ\",\n", 86 | "\n", 87 | " vector_config=[\n", 88 | " Configure.Vectors.text2vec_weaviate(\n", 89 | " name=\"main_vector\",\n", 90 | "\n", 91 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 92 | " source_properties=['title', 'text'],\n", 93 | "\n", 94 | " # Configure SQ\n", 95 | " vector_index_config=Configure.VectorIndex.hnsw(\n", 96 | " quantizer=Configure.VectorIndex.Quantizer.sq(\n", 97 | " rescore_limit=200, # the number of overfetched candidates used for rescoring\n", 98 | " training_limit=10_000 # (default 100k) number of objects needed to train the codebook\n", 99 | " )\n", 100 | " ),\n", 101 | " )\n", 102 | " ],\n", 103 | ")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## The rest is the same" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from data_loader import import_wiki_data\n", 120 | "import_wiki_data(client, \"WikiQ\", 25000)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "WikiQ = client.collections.use(\"WikiQ\")\n", 130 | "WikiQ.aggregate.over_all()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Clean up" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 |
"source": [ 146 | "# client.collections.delete(\"WikiQ\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Close the client" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "client.close()" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": ".venv (3.11.9)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.11.9" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 2 187 | } 188 | -------------------------------------------------------------------------------- /5-vector-compression/1-bq-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# BQ Compression – Load Data and compress vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from 
weaviate.classes.init import Auth\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | ")\n", 50 | "\n", 51 | "client.is_ready()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Create Collection with BQ configuration\n", 59 | "\n", 60 | "[Docs: Binary Quantization (BQ)](https://weaviate.io/developers/weaviate/configuration/compression/bq-compression)\n", 61 | "\n", 62 | "Note #1: Binary Quantization works from the first object added to the collection. No training required.\n", 63 | "\n", 64 | "Note #2: Binary Quantization works both with HNSW and Flat index." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from weaviate.classes.config import Configure, VectorDistances\n", 74 | "\n", 75 | "client.collections.delete(\"WikiQ\")\n", 76 | "\n", 77 | "# Create a collection here - with Weaviate as a vectorizer\n", 78 | "client.collections.create(\n", 79 | " name=\"WikiQ\",\n", 80 | "\n", 81 | " vector_config=[\n", 82 | " Configure.Vectors.text2vec_weaviate(\n", 83 | " name=\"main_vector\",\n", 84 | "\n", 85 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 86 | " source_properties=['title', 'text'],\n", 87 | "\n", 88 | " # Configure BQ with flat vector index\n", 89 | " vector_index_config=Configure.VectorIndex.flat(\n", 90 | " distance_metric=VectorDistances.COSINE,\n", 91 | " vector_cache_max_objects=100_000,\n", 92 | " quantizer=Configure.VectorIndex.Quantizer.bq(\n", 93 | " rescore_limit=200,\n", 94 | " cache=True\n", 95 | " )\n", 96 | " ),\n", 97 | "\n", 98 | " # HNSW example\n", 99 | " # vector_index_config=Configure.VectorIndex.hnsw(\n", 100 | " # quantizer=Configure.VectorIndex.Quantizer.bq(\n", 101 | " # rescore_limit=200,\n", 102 | " # cache=True\n", 103 | " # )\n", 104 | " # ),\n", 105 | " )\n", 106
| " ],\n", 107 | ")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## The rest is the same" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from data_loader import import_wiki_data\n", 124 | "import_wiki_data(client, \"WikiQ\", 25000)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "wikiQ = client.collections.use(\"WikiQ\")\n", 134 | "wikiQ.aggregate.over_all()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## Clean up" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# client.collections.delete(\"WikiQ\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## Close the client" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "client.close()" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": ".venv (3.11.9)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.11.9" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /5-vector-compression/1-pq-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Compression – Load Data and compress vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 45 | "\n", 46 | "client = weaviate.connect_to_custom(\n", 47 | " http_host=WEAVIATE_URL,\n", 48 | " http_port=8080,\n", 49 | " http_secure=False,\n", 50 | " grpc_host=WEAVIATE_KEY,\n", 51 | " grpc_port=50051,\n", 52 | " grpc_secure=False,\n", 53 | ")\n", 54 | "\n", 55 | "client.is_ready()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Create Collection with PQ configuration\n", 63 | "\n", 64 | "[Docs: Product Quantization (PQ)](https://weaviate.io/developers/weaviate/configuration/compression/pq-compression)\n", 65 | "\n", 66 | "> Note: Product Quantization includes a training phase, which is required to create codebooks (codebooks are used to generate centroids for compressed vectors).
\n", 67 | "> In other words, based on your data, it figures out how to best compress your vectors.\n", 68 | ">\n", 69 | "> The compression training starts when the collection reaches `training_limit` number of objects.
\n", 70 | "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from weaviate.classes.config import Configure\n", 80 | "\n", 81 | "client.collections.delete(\"WikiQ\")\n", 82 | "\n", 83 | "# Create a collection here - with Weaviate as a vectorizer\n", 84 | "client.collections.create(\n", 85 | " name=\"WikiQ\",\n", 86 | "\n", 87 | " vector_config=[\n", 88 | " Configure.Vectors.text2vec_weaviate(\n", 89 | " name=\"main_vector\",\n", 90 | "\n", 91 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 92 | " source_properties=['title', 'text'],\n", 93 | "\n", 94 | " # Configure PQ\n", 95 | " vector_index_config=Configure.VectorIndex.hnsw(\n", 96 | " quantizer=Configure.VectorIndex.Quantizer.pq(\n", 97 | " segments=256, # 1024/4 # new number of dimension segments\n", 98 | " training_limit=10_000 # (default 100k) number of objects needed to train the codebook\n", 99 | " )\n", 100 | " ),\n", 101 | " )\n", 102 | " ],\n", 103 | ")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## The rest is the same" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from data_loader import import_wiki_data\n", 120 | "import_wiki_data(client, \"WikiQ\", 25000)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "WikiQ = client.collections.use(\"WikiQ\")\n", 130 | "WikiQ.aggregate.over_all()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Clean up" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "#
client.collections.delete(\"WikiQ\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Close the client" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "client.close()" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": ".venv (3.11.9)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.11.9" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 2 187 | } 188 | -------------------------------------------------------------------------------- /4-multi-tenancy/4-search-tenants.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multi-tenant Chat with Papers - Query papers\n", 8 | "## Get keys and urls" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import os\n", 18 | "from dotenv import load_dotenv\n", 19 | "\n", 20 | "load_dotenv()\n", 21 | "\n", 22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 23 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 24 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])\n", 28 | "print(OPENAI_API_KEY[:10])" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Connect to Weaviate" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | 
"outputs": [], 43 | "source": [ 44 | "import weaviate\n", 45 | "from weaviate.classes.init import Auth\n", 46 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 47 | "\n", 48 | "client = weaviate.connect_to_weaviate_cloud(\n", 49 | " cluster_url=WEAVIATE_URL,\n", 50 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 51 | "\n", 52 | " headers = {\n", 53 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n", 54 | " },\n", 55 | "\n", 56 | " # additional_config=AdditionalConfig(\n", 57 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 58 | " # )\n", 59 | ")\n", 60 | "\n", 61 | "client.is_ready()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Vector search on tenants" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "papers = client.collections.use(\"Papers\")\n", 78 | "\n", 79 | "ten = papers.with_tenant(\"2212-10496\")\n", 80 | "\n", 81 | "response = ten.query.near_text(\n", 82 | " query=\"Unsupervised learning\",\n", 83 | " limit=5,\n", 84 | ")\n", 85 | "\n", 86 | "for item in response.objects:\n", 87 | " print(item.properties[\"chunk\"])" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Generative Search with tenants" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "papers = client.collections.use(\"Papers\")\n", 104 | "\n", 105 | "ten2212 = papers.with_tenant(\"2212-10496\")\n", 106 | "\n", 107 | "response = ten2212.generate.near_text(\n", 108 | " query=\"Unsupervised learning\",\n", 109 | " limit=5,\n", 110 | " single_prompt=\"What does the following text describe: {chunk}\",\n", 111 | ")\n", 112 | "\n", 113 | "for item in response.objects:\n", 114 | " print(item.properties[\"chunk\"])\n", 115 | " print(item.generative.text, '\\n')" 116 | ] 
117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "papers = client.collections.use(\"Papers\")\n", 125 | "\n", 126 | "ten2212 = papers.with_tenant(\"2212-10496\")\n", 127 | "\n", 128 | "response = ten2212.generate.near_text(\n", 129 | " query=\"Unsupervised learning\",\n", 130 | " limit=5,\n", 131 | " grouped_task=\"Explain how unsupervised learning works. Use only the provided content.\",\n", 132 | " grouped_properties=[\"chunk\"]\n", 133 | ")\n", 134 | "\n", 135 | "for item in response.objects:\n", 136 | " print(item.properties[\"chunk\"])\n", 137 | "\n", 138 | "print(response.generative.text)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "def paper_rag(paper_id, query, prompt):\n", 148 | " papers = client.collections.use(\"Papers\")\n", 149 | " ten = papers.with_tenant(paper_id)\n", 150 | "\n", 151 | " response = ten.generate.near_text(\n", 152 | " query=query,\n", 153 | " limit=5,\n", 154 | " grouped_task=prompt + \" Use only the provided content.\",\n", 155 | " grouped_properties=[\"chunk\"],\n", 156 | " )\n", 157 | "\n", 158 | " return {\n", 159 | " \"title\": response.objects[0].properties[\"title\"],\n", 160 | " \"source\": [p.properties[\"chunk\"] for p in response.objects],\n", 161 | " \"generated\": response.generative.text\n", 162 | " }" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "paper_rag(\n", 172 | " \"2212-10496\",\n", 173 | " \"Unsupervised learning\",\n", 174 | " \"Explain how unsupervised learning works\"\n", 175 | ")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "papers = client.collections.use(\"Papers\")\n", 185 | "papers.tenants.get()" 186 | 
] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Close the client" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "client.close()" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": ".venv (3.11.9)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.9" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /3-rag/2-rag-gen-query-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !pip install openai" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", 26 | "\n", 27 | "print(WEAVIATE_URL[:10])\n", 28 | "print(WEAVIATE_KEY[:10])\n", 29 | "print(OPENAI_API_KEY[:10])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Generate query from prompt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from openai import 
OpenAI\n", 46 | "\n", 47 | "openai_client = OpenAI(\n", 48 | " api_key=OPENAI_API_KEY,\n", 49 | " base_url=\"https://api.openai.com/v1\",\n", 50 | ")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "def generate_query_from_promt(prompt):\n", 60 | " response = openai_client.chat.completions.create(\n", 61 | " model=\"gpt-3.5-turbo\",\n", 62 | " messages=[\n", 63 | " { \"role\": \"system\", \"content\": \"Your job is to extract a query from the provided user prompt, the query will then be used to run a query in a vector database.\" },\n", 64 | " { \n", 65 | " \"role\": \"user\",\n", 66 | " \"content\": f\"Please give me a 2-3 word query that can be used to find relevant info to the following prompt - {prompt}\"\n", 67 | " },\n", 68 | " ]\n", 69 | " )\n", 70 | " return response.choices[0].message.content" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Example of how to generate a query from a prompt\n", 80 | "generate_query_from_promt(\"Where do the tallest penguins live?\")" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Connect to Weaviate" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "import weaviate\n", 97 | "from weaviate.classes.init import Auth\n", 98 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 99 | "\n", 100 | "client = weaviate.connect_to_weaviate_cloud(\n", 101 | " cluster_url=WEAVIATE_URL,\n", 102 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 103 | "\n", 104 | " headers = {\n", 105 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n", 106 | " },\n", 107 | "\n", 108 | " # additional_config=AdditionalConfig(\n", 109 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 110 | " # 
)\n", 111 | ")\n", 112 | "\n", 113 | "client.is_ready()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Two-step RAG" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "def two_step_rag(user_prompt):\n", 130 | " # Step 1\n", 131 | " prompt = user_prompt + \" Please only use the provided content with this prompt. Don't make things up.\"\n", 132 | " \n", 133 | " generated_query = generate_query_from_promt(prompt)\n", 134 | " print(\"=== Generated Query ===\")\n", 135 | " print(f\"Generated query: {generated_query}\")\n", 136 | "\n", 137 | " # Step 2\n", 138 | " wiki = client.collections.use(\"Wiki\")\n", 139 | "\n", 140 | " response = wiki.generate.near_text(\n", 141 | " query=generated_query,\n", 142 | " limit=3,\n", 143 | " grouped_task=prompt,\n", 144 | " grouped_properties=[\"text\", \"title\"]\n", 145 | " )\n", 146 | "\n", 147 | " # Print results\n", 148 | " print(\"\\n=== Generated Response ===\")\n", 149 | " print(response.generative.text)\n", 150 | "\n", 151 | " print(\"\\n=== Source ===\")\n", 152 | " for item in response.objects:\n", 153 | " print(item.properties)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# two_step_rag(\"What wild animals do we know about?\")\n", 163 | "two_step_rag(\"Please provide an explanation at a highschool level. 
How do airplanes fly?\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "two_step_rag(\"What are the pros and cons of automation using computer?\")" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "two_step_rag(\"How do CPUs work?\")" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Close the client" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "client.close()" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": ".venv (3.11.9)", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.11.9" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 2 222 | } 223 | -------------------------------------------------------------------------------- /4-multi-tenancy/3b-load-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multi-tenant Chat with Papers - Load and chunk papers\n", 8 | "## Get keys and urls" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import os\n", 18 | "from dotenv import load_dotenv\n", 19 | "\n", 20 | "load_dotenv()\n", 21 | "\n", 22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 23 | "WEAVIATE_KEY = 
os.getenv(\"WEAVIATE_KEY\")\n", 24 | "\n", 25 | "print(WEAVIATE_URL[:10])\n", 26 | "print(WEAVIATE_KEY[:10])" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Connect to Weaviate" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import weaviate\n", 43 | "from weaviate.classes.init import Auth\n", 44 | "\n", 45 | "client = weaviate.connect_to_weaviate_cloud(\n", 46 | " cluster_url=WEAVIATE_URL,\n", 47 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 48 | ")\n", 49 | "\n", 50 | "client.is_ready()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Load Data from arxiv\n", 58 | "\n", 59 | "1. Get chunks from paper - `get_chunks_from_paper`\n", 60 | "2. Create a tenant for the paper - `create_tenant`\n", 61 | "3. Batch import chunks - `batch_import_chunks`" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### 1. 
Get chunks from paper - `get_chunks_from_paper`" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from distyll.text import from_arxiv_paper\n", 78 | "from distyll.utils import chunk_text\n", 79 | "\n", 80 | "def get_chunks_from_paper(url):\n", 81 | " paper = from_arxiv_paper(url)\n", 82 | " chunks = chunk_text(source_text=paper[\"text\"])\n", 83 | "\n", 84 | " paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n", 85 | " paper[\"chunks\"] = chunks\n", 86 | " return paper" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "chunked_2212 = get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n", 96 | "chunked_2212" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### 2. Create a tenant for the paper - `create_tenant`" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from weaviate.classes.tenants import Tenant\n", 113 | "papers = client.collections.use(\"Papers\")\n", 114 | "\n", 115 | "def create_tenant(chunked_paper):\n", 116 | " tenant_name = chunked_paper[\"arxiv_id\"]\n", 117 | "\n", 118 | " papers.tenants.create([\n", 119 | " Tenant(name=tenant_name)\n", 120 | " ])\n", 121 | "\n", 122 | " return tenant_name" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "create_tenant(chunked_2212)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "papers.tenants.get()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### 
3. Batch import chunks - `batch_import_chunks`" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def batch_import_chunks(chunked_paper):\n", 157 | " ten = papers.with_tenant(chunked_paper[\"arxiv_id\"])\n", 158 | "\n", 159 | " i=0\n", 160 | " with ten.batch.dynamic() as batch:\n", 161 | " for chunk in chunked_paper[\"chunks\"]:\n", 162 | " batch.add_object({\n", 163 | " \"title\": chunked_paper[\"title\"],\n", 164 | " \"url\": chunked_paper[\"url\"],\n", 165 | " \"chunk\": chunk,\n", 166 | " \"chunk_no\": i,\n", 167 | " })\n", 168 | " i+=1\n", 169 | "\n", 170 | " # Read failures from `ten`, the same tenant-scoped handle that ran the batch\n", 171 | " if(len(ten.batch.failed_objects)>0):\n", 172 | " print(\"Import complete with errors\")\n", 173 | " for err in ten.batch.failed_objects:\n", 174 | " print(err)\n", 175 | " else:\n", 176 | " print(\"Import complete with no errors\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "batch_import_chunks(chunked_2212)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## End-to-end paper load" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def import_paper(url):\n", 202 | " cp = get_chunks_from_paper(url)\n", 203 | " tenant_name = create_tenant(cp)\n", 204 | " batch_import_chunks(cp)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "import_paper(\"https://arxiv.org/pdf/2401.00107.pdf\")" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Close the client" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | 
"metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "client.close()" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": ".venv (3.11.9)", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.11.9" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /5-vector-compression/0-vector-indexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Vector Indexes\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | ")\n", 50 | "\n", 51 | "client.is_ready()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 
56 | "metadata": {}, 57 | "source": [ 58 | "## Collection with HNSW index (default)\n", 59 | "\n", 60 | "[HNSW params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#hnsw-index-parameters)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from weaviate.classes.config import Configure, VectorDistances\n", 70 | "\n", 71 | "client.collections.delete(\"IndexExample\")\n", 72 | "\n", 73 | "# Create a collection here - with Weaviate as a vectorizer\n", 74 | "client.collections.create(\n", 75 | " name=\"IndexExample\",\n", 76 | "\n", 77 | " vector_config=[\n", 78 | " Configure.Vectors.text2vec_weaviate(\n", 79 | " name=\"main_vector\",\n", 80 | "\n", 81 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 82 | " source_properties=['title', 'text'],\n", 83 | "\n", 84 | " # HNSW example \n", 85 | " vector_index_config=Configure.VectorIndex.hnsw()\n", 86 | " )\n", 87 | " ],\n", 88 | ")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Collection with Flat index\n", 96 | "\n", 97 | "[Flat params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#flat-indexes)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from weaviate.classes.config import Configure, VectorDistances\n", 107 | "\n", 108 | "client.collections.delete(\"IndexExample\")\n", 109 | "\n", 110 | "# Create a collection here - with Weaviate as a vectorizer\n", 111 | "client.collections.create(\n", 112 | " name=\"IndexExample\",\n", 113 | "\n", 114 | " vector_config=[\n", 115 | " Configure.Vectors.text2vec_weaviate(\n", 116 | " name=\"main_vector\",\n", 117 | "\n", 118 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 119 | " source_properties=['title', 'text'],\n", 120 | "\n", 121 | " # Flat example\n", 122 | " 
vector_index_config=Configure.VectorIndex.flat(\n", 123 | " # distance_metric=VectorDistances.COSINE, # optional\n", 124 | " vector_cache_max_objects=100_000,\n", 125 | " ),\n", 126 | " ),\n", 127 | " ],\n", 128 | ")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Collection with Dynamic index\n", 136 | "\n", 137 | "[Dynamic params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#dynamic-index-parameters)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from weaviate.classes.config import Configure\n", 147 | "\n", 148 | "client.collections.delete(\"IndexExample\")\n", 149 | "\n", 150 | "# Create a collection here - with Weaviate as a vectorizer\n", 151 | "client.collections.create(\n", 152 | " name=\"IndexExample\",\n", 153 | "\n", 154 | " vector_config=[\n", 155 | " Configure.Vectors.text2vec_weaviate(\n", 156 | " name=\"main_vector\",\n", 157 | "\n", 158 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 159 | " source_properties=['title', 'text'],\n", 160 | "\n", 161 | " # Dynamic example\n", 162 | " vector_index_config=Configure.VectorIndex.dynamic(\n", 163 | " threshold=10_000, # when to switch to HNSW\n", 164 | "\n", 165 | " flat=Configure.VectorIndex.flat(\n", 166 | " vector_cache_max_objects=100_000,\n", 167 | " # note: can also include a quantizer\n", 168 | " quantizer=Configure.VectorIndex.Quantizer.bq()\n", 169 | " ),\n", 170 | "\n", 171 | " hnsw=Configure.VectorIndex.hnsw(\n", 172 | " max_connections=32, # optional\n", 173 | " # note: the quantizer can be different between flat and hnsw\n", 174 | " quantizer=Configure.VectorIndex.Quantizer.pq()\n", 175 | " ),\n", 176 | " ),\n", 177 | " )\n", 178 | " ],\n", 179 | ")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## The rest is the same" 187 | ] 188 | }, 189 | 
{ 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "# uncomment the lines below if you want to import 25k objects into your collection to test it\n", 196 | "# from data_loader import import_wiki_data\n", 197 | "# import_wiki_data(client, \"IndexExample\", 25_000)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# index_example = client.collections.use(\"IndexExample\")\n", 207 | "# index_example.aggregate.over_all()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Clean up" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "client.collections.delete(\"IndexExample\")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## Close the client" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "client.close()" 240 | ] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": ".venv (3.11.9)", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.11.9" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 2 264 | } 265 | -------------------------------------------------------------------------------- /2-pre-vectorised-data/2-wiki-import.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | 
"metadata": {}, 6 | "source": [ 7 | "# Load Data with Vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | ")\n", 50 | "\n", 51 | "client.is_ready()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "weaviate.__version__" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from weaviate.classes.config import Configure\n", 70 | "\n", 71 | "def create_wiki_collection():\n", 72 | " if client.collections.exists(\"Wiki\"):\n", 73 | " client.collections.delete(\"Wiki\")\n", 74 | "\n", 75 | " # Create a collection here - with Weaviate vectorizer and define source properties\n", 76 | " client.collections.create(\n", 77 | " name=\"Wiki\",\n", 78 | "\n", 79 | " vector_config=[\n", 80 | " Configure.Vectors.text2vec_weaviate(\n", 81 | " name=\"main_vector\",\n", 82 | "\n", 83 | " # TODO: use model Snowflake/snowflake-arctic-embed-l-v2.0\n", 84 | " # TODO: set source properties to title and text\n", 
85 | " \n", 86 | " )\n", 87 | " ],\n", 88 | " )\n", 89 | "\n", 90 | "create_wiki_collection()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Load the data from parquet files" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from datasets import load_dataset\n", 107 | "\n", 108 | "def prepare_dataset():\n", 109 | " return load_dataset('parquet', data_files={'train': ['../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n", 110 | " # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Dataset Test\n", 118 | "" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "dataset = prepare_dataset()\n", 128 | "\n", 129 | "counter = 10\n", 130 | "for i in dataset:\n", 131 | " print(i)\n", 132 | "\n", 133 | " counter -= 1\n", 134 | " if(counter == 0): break" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "### The import function\n", 142 | "\n", 143 | "`TODO:`\n", 144 | "* add a function to add objects to batch" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from tqdm import tqdm\n", 154 | "from weaviate.util import generate_uuid5\n", 155 | "\n", 156 | "def import_wiki_data(max_rows=10_000):\n", 157 | " print(f\"Importing {max_rows} data items\")\n", 158 | "\n", 159 | " dataset = prepare_dataset()\n", 160 | " wiki = client.collections.use(\"Wiki\")\n", 161 | "\n", 162 | " counter = 0\n", 163 | "\n", 164 | " with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as 
batch:\n", 165 | " for item in tqdm(dataset, total=max_rows):\n", 166 | "\n", 167 | " data_to_insert = { \n", 168 | " \"wiki_id\": item[\"wiki_id\"],\n", 169 | " \"text\": item[\"text\"],\n", 170 | " \"title\": item[\"title\"],\n", 171 | " \"url\": item[\"url\"],\n", 172 | " }\n", 173 | "\n", 174 | " item_id = generate_uuid5(item[\"wiki_id\"])\n", 175 | "\n", 176 | " item_vector = {\n", 177 | " \"main_vector\": item[\"vector\"]\n", 178 | " }\n", 179 | "\n", 180 | " # TODO: add objects to batch using\n", 181 | " batch.add_object(\n", 182 | " # * data_to_insert\n", 183 | " # * item_id\n", 184 | " # * item_vector\n", 185 | " )\n", 186 | "\n", 187 | " # Check number of errors while running\n", 188 | " if(batch.number_errors > 10):\n", 189 | " print(f\"Reached {batch.number_errors} errors during batch import\")\n", 190 | " break\n", 191 | " \n", 192 | " # stop after the request number reaches = max_rows\n", 193 | " counter += 1\n", 194 | " if counter >= max_rows:\n", 195 | " break\n", 196 | " \n", 197 | " # check for errors at the end\n", 198 | " if (len(wiki.batch.failed_objects)>0):\n", 199 | " print(\"Final error check\")\n", 200 | " print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n", 201 | " print(wiki.batch.failed_objects[-1])\n", 202 | " \n", 203 | " print(f\"Imported {counter} items\")\n", 204 | " print(\"-----------------------------------\")" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "import_wiki_data(10_000)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Check if data loaded correctly" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "wiki = client.collections.use(\"Wiki\")\n", 230 | "len(wiki)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | 
"metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n", 240 | "print(res.objects[0].properties)\n", 241 | "print(res.objects[0].vector)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "client.close()" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": ".venv (3.11.9)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.11.9" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | -------------------------------------------------------------------------------- /2-pre-vectorised-data/complete/2-wiki-import-complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Data with Vectors\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import 
weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "\n", 46 | "client = weaviate.connect_to_weaviate_cloud(\n", 47 | " cluster_url=WEAVIATE_URL,\n", 48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 49 | ")\n", 50 | "\n", 51 | "client.is_ready()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from weaviate.classes.config import Configure\n", 61 | "\n", 62 | "def create_wiki_collection():\n", 63 | " if client.collections.exists(\"Wiki\"):\n", 64 | " client.collections.delete(\"Wiki\")\n", 65 | "\n", 66 | " # Create a collection here - with Weaviate vectorizer and define source properties\n", 67 | " client.collections.create(\n", 68 | " name=\"Wiki\",\n", 69 | "\n", 70 | " vector_config=[\n", 71 | " Configure.Vectors.text2vec_weaviate(\n", 72 | " name=\"main_vector\",\n", 73 | "\n", 74 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 75 | " source_properties=['title', 'text'] # which properties should be used to generate a vector\n", 76 | " )\n", 77 | " ],\n", 78 | " )\n", 79 | "\n", 80 | "create_wiki_collection()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Load the data from parquet files" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from datasets import load_dataset\n", 97 | "\n", 98 | "def prepare_dataset():\n", 99 | " return load_dataset('parquet', data_files={'train': ['../../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n", 100 | " # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### Dataset Test\n", 108 | "" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | 
"execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "dataset = prepare_dataset()\n", 118 | "\n", 119 | "counter = 10\n", 120 | "for i in dataset:\n", 121 | " print(i)\n", 122 | "\n", 123 | " counter -= 1\n", 124 | " if(counter == 0): break" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### The import function\n", 132 | "\n", 133 | "`TODO:`\n", 134 | "* add a function to add objects to batch" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "from tqdm import tqdm\n", 144 | "from weaviate.util import generate_uuid5\n", 145 | "\n", 146 | "def import_wiki_data(max_rows=10_000):\n", 147 | " print(f\"Importing {max_rows} data items\")\n", 148 | "\n", 149 | " dataset = prepare_dataset()\n", 150 | " wiki = client.collections.use(\"Wiki\")\n", 151 | "\n", 152 | " counter = 0\n", 153 | "\n", 154 | " with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n", 155 | " for item in tqdm(dataset, total=max_rows):\n", 156 | "\n", 157 | " data_to_insert = { \n", 158 | " \"wiki_id\": item[\"wiki_id\"],\n", 159 | " \"text\": item[\"text\"],\n", 160 | " \"title\": item[\"title\"],\n", 161 | " \"url\": item[\"url\"],\n", 162 | " }\n", 163 | "\n", 164 | " item_id = generate_uuid5(item[\"wiki_id\"])\n", 165 | "\n", 166 | " # vector = item[\"vector\"]\n", 167 | " item_vector = {\n", 168 | " \"main_vector\": item[\"vector\"]\n", 169 | " }\n", 170 | "\n", 171 | " batch.add_object(\n", 172 | " properties=data_to_insert,\n", 173 | " \n", 174 | " uuid=item_id,\n", 175 | " vector=item_vector\n", 176 | " )\n", 177 | "\n", 178 | " # Check number of errors while running\n", 179 | " if(batch.number_errors > 10):\n", 180 | " print(f\"Reached {batch.number_errors} Errors during batch import\")\n", 181 | " break\n", 182 | " \n", 183 | " # stop after the request number reaches = 
max_rows\n", 184 | " counter += 1\n", 185 | " if(counter >= max_rows):\n", 186 | " break\n", 187 | " \n", 188 | " # check for errors at the end\n", 189 | " if (len(wiki.batch.failed_objects)>0):\n", 190 | " print(\"Final error check\")\n", 191 | " print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n", 192 | " print(wiki.batch.failed_objects[-1])\n", 193 | " \n", 194 | " print(f\"Imported {counter} items\")\n", 195 | " print(\"-----------------------------------\")" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import_wiki_data(10_000)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Check if data loaded correctly" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "wiki = client.collections.use(\"Wiki\")\n", 221 | "len(wiki)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n", 231 | "print(res.objects[0].properties)\n", 232 | "print(res.objects[0].vector)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## Close the client" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "client.close()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": ".venv (3.11.9)", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | 
}, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.11.9" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /3-rag/complete/1-rag-complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RAG - How to query" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "from dotenv import load_dotenv\n", 18 | "\n", 19 | "load_dotenv()\n", 20 | "\n", 21 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 22 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 23 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", 24 | "\n", 25 | "print(WEAVIATE_URL[:10])\n", 26 | "print(WEAVIATE_KEY[:10])\n", 27 | "print(OPENAI_API_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 46 | "\n", 47 | "client = weaviate.connect_to_weaviate_cloud(\n", 48 | " cluster_url=WEAVIATE_URL,\n", 49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 50 | "\n", 51 | " headers = {\n", 52 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n", 53 | " },\n", 54 | "\n", 55 | " # additional_config=AdditionalConfig(\n", 56 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 57 | " # )\n", 58 | ")\n", 59 | "\n", 60 | "client.is_ready()" 61 | ] 62 | }, 63 | { 64 | 
"cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Start with (R) - Retrieval" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "wiki = client.collections.use(\"Wiki\")\n", 77 | "\n", 78 | "response = wiki.query.near_text(\n", 79 | " query=\"How do planes fly\",\n", 80 | " limit=5,\n", 81 | " return_properties=[\"text\", \"title\"]\n", 82 | ")\n", 83 | "\n", 84 | "for item in response.objects:\n", 85 | " print(item.properties)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Add (AG) - augmented generation - to make full RAG" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "#### Single Prompt\n", 100 | "\n", 101 | "> Generate a response per **retrieved** object." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# Let's add some colour to our lives :)\n", 111 | "BLUE = \"\\033[94m\"\n", 112 | "PURPLE = \"\\033[95m\"\n", 113 | "RESET = \"\\033[0\"" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "from weaviate.classes.generate import GenerativeConfig\n", 123 | "\n", 124 | "wiki = client.collections.use(\"Wiki\")\n", 125 | "\n", 126 | "response = wiki.generate.near_text(\n", 127 | " query=\"How do planes fly\",\n", 128 | " # auto_limit=1,\n", 129 | " limit=5,\n", 130 | "\n", 131 | " # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n", 132 | " generative_provider=GenerativeConfig.openai(\n", 133 | " model=\"gpt-4o-mini\",\n", 134 | " ),\n", 135 | " \n", 136 | " # TODO: add a single prompt \"Explain what this is about? {text}\"\n", 137 | " single_prompt=\"Explain what this is about? 
{text}\"\n", 138 | ")\n", 139 | "\n", 140 | "for item in response.objects:\n", 141 | " print(f\"{BLUE}=== Source ===\")\n", 142 | " print(item.properties)\n", 143 | "\n", 144 | " print(f\"{PURPLE}=== Generated Response ===\")\n", 145 | " print(item.generative.text)\n", 146 | " print(\"\\n\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "#### Grouped Task\n", 154 | "\n", 155 | "> Generate one response based on all **retrieved** objects." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "wiki = client.collections.use(\"Wiki\")\n", 165 | "\n", 166 | "response = wiki.generate.near_text(\n", 167 | " query=\"How do planes fly\",\n", 168 | " # auto_limit=1,\n", 169 | " limit=5,\n", 170 | "\n", 171 | " generative_provider=GenerativeConfig.openai(\n", 172 | " model=\"gpt-4o-mini\",\n", 173 | " ),\n", 174 | "\n", 175 | " grouped_task=\"Explain, how do planes fly? Please only use the provided content.\"\n", 176 | ")\n", 177 | "\n", 178 | "print(f\"{PURPLE}=== Generated Response ===\")\n", 179 | "print(response.generative.text)\n", 180 | "\n", 181 | "print(f\"{BLUE}=== Source ===\")\n", 182 | "for item in response.objects:\n", 183 | " print(item.properties)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "#### Specify which properties to use for grouped task" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "wiki = client.collections.use(\"Wiki\")\n", 200 | "\n", 201 | "response = wiki.generate.near_text(\n", 202 | " query=\"How do planes fly\",\n", 203 | " auto_limit=1,\n", 204 | " grouped_task=\"Explain, how do planes fly? 
Please only use the provided content.\",\n", 205 | " grouped_properties=[\"text\", \"title\"],\n", 206 | "\n", 207 | " generative_provider=GenerativeConfig.openai(\n", 208 | " model=\"gpt-4o-mini\",\n", 209 | " ),\n", 210 | ")\n", 211 | "\n", 212 | "print(f\"{PURPLE}=== Generated Response ===\")\n", 213 | "print(response.generative.text)\n", 214 | "\n", 215 | "print(f\"{BLUE}=== Source ===\")\n", 216 | "for item in response.objects:\n", 217 | " print(item.properties)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## Set default Generative model" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from weaviate.classes.config import Reconfigure\n", 234 | "\n", 235 | "wiki = client.collections.use(\"Wiki\")\n", 236 | "\n", 237 | "wiki.config.update(\n", 238 | " generative_config=Reconfigure.Generative.openai(\n", 239 | " model=\"gpt-4o-mini\" # Update the generative model\n", 240 | " )\n", 241 | ")" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "> Try generative query without providing the model" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "response = wiki.generate.near_text(\n", 258 | " query=\"What african animals do we have info on. Please only list those provided in here.\",\n", 259 | " auto_limit=1,\n", 260 | " \n", 261 | " grouped_task=\"Explain, how do planes fly? 
Please only use the provided content.\",\n", 262 | ")\n", 263 | "\n", 264 | "print(f\"{PURPLE}=== Generated Response ===\")\n", 265 | "print(response.generative.text)\n", 266 | "\n", 267 | "print(f\"{BLUE}=== Source ===\")\n", 268 | "for item in response.objects:\n", 269 | " print(item.properties)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Close the client" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "client.close()" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": ".venv (3.11.9)", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.11.9" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /3-rag/1-rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RAG - How to query" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "from dotenv import load_dotenv\n", 18 | "\n", 19 | "load_dotenv()\n", 20 | "\n", 21 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 22 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 23 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", 24 | "\n", 25 | "print(WEAVIATE_URL[:10])\n", 26 | "print(WEAVIATE_KEY[:10])\n", 27 | "print(OPENAI_API_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": 
"markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 46 | "\n", 47 | "client = weaviate.connect_to_weaviate_cloud(\n", 48 | " cluster_url=WEAVIATE_URL,\n", 49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 50 | "\n", 51 | " headers = {\n", 52 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n", 53 | " },\n", 54 | "\n", 55 | " # additional_config=AdditionalConfig(\n", 56 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 57 | " # )\n", 58 | ")\n", 59 | "\n", 60 | "client.is_ready()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Start with (R) - Retrieval" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "wiki = client.collections.use(\"Wiki\")\n", 77 | "\n", 78 | "response = wiki.query.near_text(\n", 79 | " query=\"How do planes fly\",\n", 80 | " limit=5,\n", 81 | " return_properties=[\"text\", \"title\"]\n", 82 | ")\n", 83 | "\n", 84 | "for item in response.objects:\n", 85 | " print(item.properties)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Add (AG) - augmented generation - to make full RAG" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "#### Single Prompt\n", 100 | "\n", 101 | "> Generate a response per **retrieved** object." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# Let's add some colour to our lives :)\n", 111 | "BLUE = \"\\033[94m\"\n", 112 | "PURPLE = \"\\033[95m\"\n", 113 | "RESET = \"\\033[0\"" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "from weaviate.classes.generate import GenerativeConfig\n", 123 | "\n", 124 | "wiki = client.collections.use(\"Wiki\")\n", 125 | "\n", 126 | "response = wiki.generate.near_text(\n", 127 | " query=\"How do planes fly\",\n", 128 | " # auto_limit=1,\n", 129 | " limit=5,\n", 130 | "\n", 131 | " # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n", 132 | " # generative_provider=GenerativeConfig.\n", 133 | "\n", 134 | " # TODO: add a single prompt \"Explain what this is about? {text}\"\n", 135 | " # single_prompt=\n", 136 | ")\n", 137 | "\n", 138 | "# NOTE: the generated responses are included with each object\n", 139 | "\n", 140 | "for item in response.objects:\n", 141 | " print(f\"{BLUE}=== Source ===\")\n", 142 | " print(item.properties)\n", 143 | "\n", 144 | " print(f\"{PURPLE}=== Generated Response ===\")\n", 145 | " # TODO: print the generative.text object\n", 146 | " # print(item.)\n", 147 | "\n", 148 | " print(\"\\n\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "#### Grouped Task\n", 156 | "\n", 157 | "> Generate one response based on all **retrieved** objects." 
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "wiki = client.collections.use(\"Wiki\")\n", 167 | "\n", 168 | "response = wiki.generate.near_text(\n", 169 | " query=\"How do planes fly\",\n", 170 | " # auto_limit=1,\n", 171 | " limit=5,\n", 172 | " \n", 173 | " generative_provider=GenerativeConfig.openai(\n", 174 | " model=\"gpt-4o-mini\",\n", 175 | " ),\n", 176 | "\n", 177 | " # TODO: add a grouped task \"Explain, how do planes fly? Please only use the provided content.\"\n", 178 | " # grouped_task=\n", 179 | ")\n", 180 | "\n", 181 | "print(f\"{PURPLE}=== Generated Response ===\")\n", 182 | "# NOTE: group task response is at response.generative.text\n", 183 | "# TODO: print the generated text\n", 184 | "# print(response.)\n", 185 | "\n", 186 | "print(f\"{BLUE}=== Source ===\")\n", 187 | "for item in response.objects:\n", 188 | " print(item.properties)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "#### Specify which properties to use for grouped task" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "response = wiki.generate.near_text(\n", 205 | " query=\"How do planes fly\",\n", 206 | " auto_limit=1,\n", 207 | "\n", 208 | " generative_provider=GenerativeConfig.openai(\n", 209 | " model=\"gpt-4o-mini\",\n", 210 | " ),\n", 211 | "\n", 212 | " grouped_task=\"Explain, how do planes fly? 
Please only use the provided content.\",\n", 213 | " # TODO: add grouped properties to only use \"text\" and \"title\" \n", 214 | " # grouped_properties=[]\n", 215 | ")\n", 216 | "\n", 217 | "print(\"=== Generated Response ===\")\n", 218 | "print(response.generative.text)\n", 219 | "\n", 220 | "print(\"=== Source ===\")\n", 221 | "for item in response.objects:\n", 222 | " print(item.properties)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Set default Generative model" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "from weaviate.classes.config import Reconfigure\n", 239 | "\n", 240 | "wiki = client.collections.use(\"Wiki\")\n", 241 | "\n", 242 | "wiki.config.update(\n", 243 | " #TODO: set generative model\n", 244 | " # generative_config=Reconfigure.Generative.\n", 245 | ")" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "> Try generative query without providing the model" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "response = wiki.generate.near_text(\n", 262 | " query=\"What african animals do we have info on. Please only list those provided in here.\",\n", 263 | " auto_limit=1,\n", 264 | " \n", 265 | " grouped_task=\"Explain, how do planes fly? 
Please only use the provided content.\",\n", 266 | ")\n", 267 | "\n", 268 | "print(f\"{PURPLE}=== Generated Response ===\")\n", 269 | "print(response.generative.text)\n", 270 | "\n", 271 | "print(f\"{BLUE}=== Source ===\")\n", 272 | "for item in response.objects:\n", 273 | " print(item.properties)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "## Close the client" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "client.close()" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": ".venv", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.11.8" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /4-multi-tenancy/1-playground-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "from dotenv import load_dotenv\n", 11 | "\n", 12 | "load_dotenv()\n", 13 | "\n", 14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 16 | "\n", 17 | "print(WEAVIATE_URL[:10])\n", 18 | "print(WEAVIATE_KEY[:10])" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Setup\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | 
"import weaviate\n", 35 | "from weaviate.classes.init import Auth\n", 36 | "\n", 37 | "client = weaviate.connect_to_weaviate_cloud(\n", 38 | " cluster_url=WEAVIATE_URL,\n", 39 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 40 | ")\n", 41 | "\n", 42 | "client.is_ready()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Create Tenant-ready collection" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from weaviate.classes.config import Configure\n", 59 | "\n", 60 | "if (client.collections.exists(\"Play\")):\n", 61 | " client.collections.delete(\"Play\")\n", 62 | "\n", 63 | "client.collections.create(\n", 64 | " \"Play\",\n", 65 | " vector_config=Configure.Vectors.self_provided(),\n", 66 | "\n", 67 | " multi_tenancy_config=Configure.multi_tenancy(True)\n", 68 | "\n", 69 | " # multi_tenancy_config=Configure.multi_tenancy(\n", 70 | " # enabled=True,\n", 71 | " # auto_tenant_creation=True, #Assign to non-existant tenant will create\n", 72 | " # auto_tenant_activation=True\n", 73 | " # )\n", 74 | ")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Create tenants\n", 82 | "> tenant name – must be made of alphanumeric characters (a-z, A-Z, 0-9), underscore (_), and hyphen (-), with a length between 1 and 64 characters'\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from weaviate.classes.tenants import Tenant\n", 92 | "\n", 93 | "play = client.collections.use(\"Play\")\n", 94 | "\n", 95 | "play.tenants.create([\n", 96 | " Tenant(name=\"ten_A\"),\n", 97 | " Tenant(name=\"ten_B\"),\n", 98 | " Tenant(name=\"ten_C\"),\n", 99 | " Tenant(name=\"ten_D\"),\n", 100 | "])" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## List Tenants" 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "play.tenants.get()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "play.tenants.exists(\"ten_E\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Access Tenants" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# this will fail – multi-tenant collections require us to use tenants\n", 142 | "play.aggregate.over_all()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "tenA = play.with_tenant(\"ten_A\")\n", 152 | "\n", 153 | "tenA.aggregate.over_all()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Insert data" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n", 170 | "play = client.collections.use(\"Play\")\n", 171 | "tenA = play.with_tenant(\"ten_A\")\n", 172 | "\n", 173 | "tenA.data.insert_many([\n", 174 | " {\n", 175 | " \"title\": \"A book about vector databases\"\n", 176 | " },\n", 177 | " {\n", 178 | " \"title\": \"Tutorial for multimodal collections\"\n", 179 | " },\n", 180 | "])\n", 181 | "\n", 182 | "tenA.aggregate.over_all()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Query Example" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "from weaviate.classes.query 
import Filter\n", 199 | "\n", 200 | "tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n", 201 | "\n", 202 | "response = tenA.query.fetch_objects(\n", 203 | " filters=Filter.by_property(\"title\").like(\"about\")\n", 204 | ")\n", 205 | "\n", 206 | "for item in response.objects:\n", 207 | " print(item.properties)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### Delete Tenants" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "play.tenants.remove([\"ten_D\"])\n", 224 | "play.tenants.get()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Update Tenants – Active & Inactive & Offloaded\n", 232 | "Tenants can be:\n", 233 | "* `Active` (default) - active tenants use `HOT` resources (RAM)\n", 234 | "* `Inactive` - inactive tenants cannot be searched; their index is not loaded into memory, so they don't use RAM\n", 235 | "* `Offloaded` - offloaded tenants are moved to cloud storage\n", 236 | "\n", 237 | "> Tenant offloading requires extra configuration, which is out of scope for this workshop.
\n", 238 | "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Deactivate – make tenant `Inactive`" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "from weaviate.classes.tenants import Tenant, TenantActivityStatus\n", 255 | "\n", 256 | "play.tenants.update([\n", 257 | " Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.INACTIVE),\n", 258 | "])\n", 259 | "\n", 260 | "play.tenants.get()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "**Cannot search `Inactive` tenants**" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n", 277 | "\n", 278 | "response = tenA.query.fetch_objects(\n", 279 | " filters=Filter.by_property(\"title\").like(\"about\")\n", 280 | ")\n", 281 | "\n", 282 | "for item in response.objects:\n", 283 | " print(item.properties)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Activate - make tenant `Active`\n", 291 | "\n", 292 | "> You can't query an inactive tenant, but you can activate it." 
293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "play.tenants.update([\n", 302 | " Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.ACTIVE),\n", 303 | "])\n", 304 | "\n", 305 | "play.tenants.get()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "response = tenA.query.fetch_objects(\n", 315 | " filters=Filter.by_property(\"title\").like(\"about\")\n", 316 | ")\n", 317 | "\n", 318 | "for item in response.objects:\n", 319 | " print(item.properties)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "### Offload - make tenant `Offloaded`\n", 327 | "\n", 328 | "> Tenant offloading requires extra configuration, which is out of scope for this workshop.
\n", 329 | "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# play.tenants.update([\n", 339 | "# Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.OFFLOADED),\n", 340 | "# ])\n", 341 | "\n", 342 | "# play.tenants.get()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## Clean up" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "client.collections.delete(\"Play\")" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## Don't close yet...\n", 366 | "\n", 367 | "> You can try again with `auto_tenant_creation=True` and `auto_tenant_activation=True`" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Close the client" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "client.close()" 384 | ] 385 | } 386 | ], 387 | "metadata": { 388 | "kernelspec": { 389 | "display_name": ".venv (3.11.9)", 390 | "language": "python", 391 | "name": "python3" 392 | }, 393 | "language_info": { 394 | "codemirror_mode": { 395 | "name": "ipython", 396 | "version": 3 397 | }, 398 | "file_extension": ".py", 399 | "mimetype": "text/x-python", 400 | "name": "python", 401 | "nbconvert_exporter": "python", 402 | "pygments_lexer": "ipython3", 403 | "version": "3.11.9" 404 | } 405 | }, 406 | "nbformat": 4, 407 | "nbformat_minor": 2 408 | } 409 | -------------------------------------------------------------------------------- /1-intro/complete/2-query-complete.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Query the data\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Connect to Weaviate" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import weaviate\n", 44 | "from weaviate.classes.init import Auth\n", 45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 46 | "\n", 47 | "client = weaviate.connect_to_weaviate_cloud(\n", 48 | " cluster_url=WEAVIATE_URL,\n", 49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 50 | "\n", 51 | " # additional_config=AdditionalConfig(\n", 52 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 53 | " # )\n", 54 | ")\n", 55 | "\n", 56 | "client.is_ready()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Helper function" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import json\n", 73 | "def print_properties(item):\n", 74 | " print(\n", 75 | " json.dumps(\n", 76 | " item.properties,\n", 77 | " indent=2, sort_keys=True, default=str\n", 78 | " )\n", 79 | " )" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Vector search\n", 
87 | "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "wiki = client.collections.use(\"Wiki\")\n", 97 | "\n", 98 | "response = wiki.query.near_text(\n", 99 | " query=\"musical instruments\",\n", 100 | " limit=5\n", 101 | ")\n", 102 | "\n", 103 | "for item in response.objects:\n", 104 | " print_properties(item)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from weaviate.classes.query import MetadataQuery\n", 114 | "\n", 115 | "wiki = client.collections.use(\"Wiki\")\n", 116 | "\n", 117 | "response = wiki.query.near_text(\n", 118 | " query=\"musical instruments\",\n", 119 | " limit=5,\n", 120 | " return_metadata=MetadataQuery(distance=True)\n", 121 | ")\n", 122 | "\n", 123 | "for item in response.objects:\n", 124 | " print_properties(item)\n", 125 | " print(item.metadata.distance)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Autocut\n", 133 | "\n", 134 | "Return groups of results based on the quality/distance jumps" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "from weaviate.classes.query import MetadataQuery\n", 144 | "\n", 145 | "wiki = client.collections.use(\"Wiki\")\n", 146 | "\n", 147 | "response = wiki.query.near_text(\n", 148 | " query=\"musical instruments\",\n", 149 | " auto_limit=1,\n", 150 | " return_metadata=MetadataQuery(distance=True)\n", 151 | ")\n", 152 | "\n", 153 | "print(f\"Returned object count: {len(response.objects)}\")\n", 154 | "\n", 155 | "for item in response.objects:\n", 156 | " print_properties(item)\n", 157 | " print(item.metadata.distance)" 158 | ] 159 | }, 160 | { 161 | "cell_type": 
"markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Filters" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Fetch with filters" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from weaviate.classes.query import Filter\n", 181 | "\n", 182 | "wiki = client.collections.use(\"Wiki\")\n", 183 | "\n", 184 | "response = wiki.query.fetch_objects(\n", 185 | " limit=5,\n", 186 | " filters=Filter.by_property(\"title\").like(\"music\")\n", 187 | ")\n", 188 | "\n", 189 | "for item in response.objects:\n", 190 | " print_properties(item)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "from weaviate.classes.query import Filter\n", 200 | "\n", 201 | "response = wiki.query.fetch_objects(\n", 202 | " limit=5,\n", 203 | " filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n", 204 | ")\n", 205 | "\n", 206 | "for item in response.objects:\n", 207 | " print_properties(item)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Search with filters\n", 215 | "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "from weaviate.classes.query import Filter\n", 225 | "\n", 226 | "response = wiki.query.near_text(\n", 227 | " query=\"musical instruments\",\n", 228 | " limit=5,\n", 229 | " filters=Filter.by_property(\"title\").not_equal(\"music\")\n", 230 | ")\n", 231 | "\n", 232 | "for item in response.objects:\n", 233 | " print_properties(item)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## 
Keyword Search\n", 241 | "\n", 242 | "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "response = wiki.query.bm25(\n", 252 | " query=\"musical instruments\",\n", 253 | " limit=5,\n", 254 | ")\n", 255 | "\n", 256 | "for item in response.objects:\n", 257 | " print_properties(item)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "from weaviate.classes.query import MetadataQuery\n", 267 | "\n", 268 | "response = wiki.query.bm25(\n", 269 | " query=\"musical instruments\",\n", 270 | " query_properties=[\"text\", \"title\"],\n", 271 | " limit=5,\n", 272 | " return_metadata=MetadataQuery(score=True)\n", 273 | ")\n", 274 | "\n", 275 | "for item in response.objects:\n", 276 | " print_properties(item)\n", 277 | " print(item.metadata.score)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from weaviate.classes.query import MetadataQuery\n", 287 | "\n", 288 | "response = wiki.query.bm25(\n", 289 | " query=\"musical instruments\",\n", 290 | " query_properties=[\"text\", \"title^3\"],\n", 291 | " limit=5,\n", 292 | " return_metadata=MetadataQuery(score=True)\n", 293 | ")\n", 294 | "\n", 295 | "for item in response.objects:\n", 296 | " print_properties(item)\n", 297 | " print(item.metadata.score)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Hybrid search\n", 305 | "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "response = wiki.query.hybrid(\n", 315 | " query=\"musical 
instruments\",\n", 316 | " alpha=0.7,\n", 317 | " limit=5,\n", 318 | ")\n", 319 | "\n", 320 | "for item in response.objects:\n", 321 | " print_properties(item)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "### Hybrid - select properties" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "response = wiki.query.hybrid(\n", 338 | " query=\"musical instruments\",\n", 339 | " alpha=0.7,\n", 340 | " limit=5,\n", 341 | " query_properties=[\"title\"]\n", 342 | ")\n", 343 | "\n", 344 | "for item in response.objects:\n", 345 | " print_properties(item)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "### Hybrid - Explain score" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "from weaviate.classes.query import MetadataQuery\n", 362 | "\n", 363 | "response = wiki.query.hybrid(\n", 364 | " query=\"musical instruments\",\n", 365 | " alpha=0.7,\n", 366 | " limit=5,\n", 367 | " query_properties=[\"title\"],\n", 368 | " return_metadata=MetadataQuery(score=True, explain_score=True)\n", 369 | ")\n", 370 | "\n", 371 | "for item in response.objects:\n", 372 | " print_properties(item)\n", 373 | " print(item.metadata.score)\n", 374 | " print(item.metadata.explain_score)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## Close the client" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "client.close()" 391 | ] 392 | } 393 | ], 394 | "metadata": { 395 | "kernelspec": { 396 | "display_name": ".venv (3.11.9)", 397 | "language": "python", 398 | "name": "python3" 399 | }, 400 | "language_info": { 401 | "codemirror_mode": { 402 | 
"name": "ipython", 403 | "version": 3 404 | }, 405 | "file_extension": ".py", 406 | "mimetype": "text/x-python", 407 | "name": "python", 408 | "nbconvert_exporter": "python", 409 | "pygments_lexer": "ipython3", 410 | "version": "3.11.9" 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 2 415 | } 416 | -------------------------------------------------------------------------------- /1-intro/2-query.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Query the data\n", 8 | "\n", 9 | "## Get keys and urls" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "from dotenv import load_dotenv\n", 20 | "\n", 21 | "load_dotenv()\n", 22 | "\n", 23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 25 | "\n", 26 | "print(WEAVIATE_URL[:10])\n", 27 | "print(WEAVIATE_KEY[:10])\n", 28 | "\n", 29 | "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n", 30 | " raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Connect to Weaviate" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import weaviate\n", 47 | "from weaviate.classes.init import Auth\n", 48 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 49 | "\n", 50 | "client = weaviate.connect_to_weaviate_cloud(\n", 51 | " cluster_url=WEAVIATE_URL,\n", 52 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 53 | "\n", 54 | " # additional_config=AdditionalConfig(\n", 55 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 56 | " # )\n", 57 | ")\n", 58 | 
"\n", 59 | "client.is_ready()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Helper function" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import json\n", 76 | "def print_properties(item):\n", 77 | " print(\n", 78 | " json.dumps(\n", 79 | " item.properties,\n", 80 | " indent=2, sort_keys=True, default=str\n", 81 | " )\n", 82 | " )" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Vector search\n", 90 | "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# TODO: get the Wiki collection\n", 100 | "# wiki = \n", 101 | "\n", 102 | "# TODO: run a near text query, search for musical instruments, with limit 5\n", 103 | "# response = wiki.query.\n", 104 | "\n", 105 | "for item in response.objects:\n", 106 | " print_properties(item)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from weaviate.classes.query import MetadataQuery\n", 116 | "\n", 117 | "wiki = client.collections.use(\"Wiki\")\n", 118 | "\n", 119 | "response = wiki.query.near_text(\n", 120 | " query=\"musical instruments\",\n", 121 | " limit=5,\n", 122 | " # TODO: add MetadataQuery - request distance\n", 123 | " # return_metadata=\n", 124 | ")\n", 125 | "\n", 126 | "for item in response.objects:\n", 127 | " print_properties(item)\n", 128 | " print(item.metadata.distance)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Autocut\n", 136 | "\n", 137 | "Return groups of results based on the quality/distance jumps" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | 
"execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from weaviate.classes.query import MetadataQuery\n", 147 | "\n", 148 | "wiki = client.collections.use(\"Wiki\")\n", 149 | "\n", 150 | "response = wiki.query.near_text(\n", 151 | " query=\"musical instruments\",\n", 152 | " # TODO: use auto_limit instead if limit, set it to 1\n", 153 | "\n", 154 | " return_metadata=MetadataQuery(distance=True)\n", 155 | ")\n", 156 | "\n", 157 | "print(f\"Returned object count: {len(response.objects)}\")\n", 158 | "\n", 159 | "for item in response.objects:\n", 160 | " print_properties(item)\n", 161 | " print(item.metadata.distance)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Filters" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Fetch with filters" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from weaviate.classes.query import Filter\n", 185 | "\n", 186 | "wiki = client.collections.use(\"Wiki\")\n", 187 | "\n", 188 | "response = wiki.query.fetch_objects(\n", 189 | " limit=5,\n", 190 | " # TODO: filter by property title, search for something like music\n", 191 | " # filters=\n", 192 | ")\n", 193 | "\n", 194 | "for item in response.objects:\n", 195 | " print_properties(item)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from weaviate.classes.query import Filter\n", 205 | "\n", 206 | "response = wiki.query.fetch_objects(\n", 207 | " limit=5,\n", 208 | " # NOTE: you can use & as AND operator and | as OR operator\n", 209 | " filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n", 210 | ")\n", 211 | "\n", 212 | "for item in response.objects:\n", 213 | " 
print_properties(item)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Search with filters\n", 221 | "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "from weaviate.classes.query import Filter\n", 231 | "\n", 232 | "response = wiki.query.near_text(\n", 233 | " query=\"musical instruments\",\n", 234 | " limit=5,\n", 235 | " filters=Filter.by_property(\"title\").not_equal(\"music\")\n", 236 | ")\n", 237 | "\n", 238 | "for item in response.objects:\n", 239 | " print_properties(item)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "## Keyword Search\n", 247 | "\n", 248 | "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# TODO: use bm25 query, search for musical instruments, set limit to 5\n", 258 | "\n", 259 | "# response = wiki.\n", 260 | "\n", 261 | "for item in response.objects:\n", 262 | " print_properties(item)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "from weaviate.classes.query import MetadataQuery\n", 272 | "\n", 273 | "response = wiki.query.bm25(\n", 274 | " query=\"musical instruments\",\n", 275 | " # TODO: add query properties for \"text\" and \"title\"\n", 276 | " # query_properties=[],\n", 277 | " limit=5,\n", 278 | " return_metadata=MetadataQuery(score=True)\n", 279 | ")\n", 280 | "\n", 281 | "for item in response.objects:\n", 282 | " print_properties(item)\n", 283 | " print(item.metadata.score)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | 
"metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from weaviate.classes.query import MetadataQuery\n", 293 | "\n", 294 | "response = wiki.query.bm25(\n", 295 | " query=\"musical instruments\",\n", 296 | " query_properties=[\"text\", \"title^3\"],\n", 297 | " limit=5,\n", 298 | " return_metadata=MetadataQuery(score=True)\n", 299 | ")\n", 300 | "\n", 301 | "for item in response.objects:\n", 302 | " print_properties(item)\n", 303 | " print(item.metadata.score)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Hybrid search\n", 311 | "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# TODO: use hybrid query, search for musical instruments, set alpha to 0.7, and limit to 5\n", 321 | "# response = wiki.\n", 322 | "\n", 323 | "for item in response.objects:\n", 324 | " print_properties(item)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Hybrid - select properties" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "response = wiki.query.hybrid(\n", 341 | " query=\"musical instruments\",\n", 342 | " alpha=0.7,\n", 343 | " limit=5,\n", 344 | " # TODO: add query properties for \"title\"\n", 345 | ")\n", 346 | "\n", 347 | "for item in response.objects:\n", 348 | " print_properties(item)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Hybrid - Explain score" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "from weaviate.classes.query import MetadataQuery\n", 365 | "\n", 366 | "response = wiki.query.hybrid(\n", 367 | " 
query=\"musical instruments\",\n", 368 | " alpha=0.7,\n", 369 | " limit=5,\n", 370 | " query_properties=[\"title\"],\n", 371 | " return_metadata=MetadataQuery(score=True, explain_score=True)\n", 372 | ")\n", 373 | "\n", 374 | "for item in response.objects:\n", 375 | " print_properties(item)\n", 376 | " print(item.metadata.score)\n", 377 | " print(item.metadata.explain_score)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "## Close the client" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "client.close()" 394 | ] 395 | } 396 | ], 397 | "metadata": { 398 | "kernelspec": { 399 | "display_name": ".venv (3.11.9)", 400 | "language": "python", 401 | "name": "python3" 402 | }, 403 | "language_info": { 404 | "codemirror_mode": { 405 | "name": "ipython", 406 | "version": 3 407 | }, 408 | "file_extension": ".py", 409 | "mimetype": "text/x-python", 410 | "name": "python", 411 | "nbconvert_exporter": "python", 412 | "pygments_lexer": "ipython3", 413 | "version": "3.11.9" 414 | } 415 | }, 416 | "nbformat": 4, 417 | "nbformat_minor": 2 418 | } 419 | -------------------------------------------------------------------------------- /2-pre-vectorised-data/1-playground-run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "from dotenv import load_dotenv\n", 11 | "\n", 12 | "load_dotenv()\n", 13 | "\n", 14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 16 | "\n", 17 | "print(WEAVIATE_URL[:10])\n", 18 | "print(WEAVIATE_KEY[:10])" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Connect to Weaviate" 26 | ] 27 | }, 28 | { 29 | 
"cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import weaviate\n", 35 | "from weaviate.classes.init import Auth\n", 36 | "\n", 37 | "client = weaviate.connect_to_weaviate_cloud(\n", 38 | " cluster_url=WEAVIATE_URL,\n", 39 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 40 | ")\n", 41 | "\n", 42 | "client.is_ready()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Create a collection with no vectorizer" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# Note: in practice, you shouldn't rerun this cell, as it deletes your data\n", 59 | "# in \"MyCollection\", and then you need to re-import it again.\n", 60 | "from weaviate.classes.config import Configure, VectorDistances\n", 61 | "\n", 62 | "# Delete the collection if it already exists\n", 63 | "if (client.collections.exists(\"MyCollection\")):\n", 64 | " client.collections.delete(\"MyCollection\")\n", 65 | "\n", 66 | "client.collections.create(\n", 67 | " name=\"MyCollection\",\n", 68 | " vector_config=Configure.Vectors.self_provided( # No vectorizer needed\n", 69 | " vector_index_config=Configure.VectorIndex.hnsw( # Optional\n", 70 | " distance_metric=VectorDistances.COSINE # select prefered distance metric \n", 71 | " )\n", 72 | " ),\n", 73 | ")\n", 74 | "\n", 75 | "print(f\"Successfully created collection: {'MyCollection'}.\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Insert an object with a vector" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "my_collection = client.collections.use(\"MyCollection\")\n", 92 | "my_collection.data.insert(\n", 93 | " properties={\n", 94 | " \"title\": \"First Object\",\n", 95 | " \"foo\": 11, \n", 96 | " },\n", 97 | " 
vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n", 98 | ")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "res = my_collection.query.fetch_objects(include_vector=True)\n", 108 | "\n", 109 | "print(res.objects[0].properties)\n", 110 | "print(res.objects[0].vector)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Insert many objects with their vectors using batch" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "source = [\n", 127 | " {\n", 128 | " \"title\": \"Second Object\",\n", 129 | " \"foo\": 22,\n", 130 | " \"vector\": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n", 131 | " },\n", 132 | " {\n", 133 | " \"title\": \"Third Object\",\n", 134 | " \"foo\": 33,\n", 135 | " \"vector\": [0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n", 136 | " },\n", 137 | " {\n", 138 | " \"title\": \"Fourth Object\",\n", 139 | " \"foo\": 44,\n", 140 | " \"vector\": [0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n", 141 | " },\n", 142 | " {\n", 143 | " \"title\": \"Fifth Object\",\n", 144 | " \"foo\": 55,\n", 145 | " \"vector\": [0.5, 0.5, 0, 0, 0, 0]\n", 146 | " },\n", 147 | "]" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "\n", 157 | "with my_collection.batch.dynamic() as batch:\n", 158 | " for item in source:\n", 159 | " batch.add_object(\n", 160 | " properties={\n", 161 | " \"title\": item[\"title\"],\n", 162 | " \"foo\": item[\"foo\"],\n", 163 | " },\n", 164 | " vector=item[\"vector\"]\n", 165 | " )" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "## Example with insert_many" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | 
"source": [ 181 | "# sample_data = [\n", 182 | "# wc.DataObject(\n", 183 | "# properties={\n", 184 | "# \"title\": \"First Object\",\n", 185 | "# \"foo\": 11, \n", 186 | "# },\n", 187 | "# vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n", 188 | "# ),\n", 189 | "# wc.DataObject(\n", 190 | "# properties={\n", 191 | "# \"title\": \"Second Object\",\n", 192 | "# \"foo\": 22,\n", 193 | "# },\n", 194 | "# vector=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n", 195 | "# ),\n", 196 | "# wc.DataObject(\n", 197 | "# properties={\n", 198 | "# \"title\": \"Third Object\",\n", 199 | "# \"foo\": 33,\n", 200 | "# },\n", 201 | "# vector=[0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n", 202 | "# ),\n", 203 | "# wc.DataObject(\n", 204 | "# properties={\n", 205 | "# \"title\": \"Fourth Object\",\n", 206 | "# \"foo\": 44,\n", 207 | "# },\n", 208 | "# vector=[0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n", 209 | "# ),\n", 210 | "# wc.DataObject(\n", 211 | "# properties={\n", 212 | "# \"title\": \"Fifth Object\",\n", 213 | "# \"foo\": 55,\n", 214 | "# },\n", 215 | "# vector=[0.5, 0.5, 0, 0, 0, 0]\n", 216 | "# ),\n", 217 | "# ]\n", 218 | "\n", 219 | "# my_collection.data.insert_many(sample_data)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Query\n", 227 | "Available types of queries you can run when working with vector embeddings (without modules) in **Weaviate**:\n", 228 | "\n", 229 | "1. [near_vector](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector)\n", 230 | "\n", 231 | "2. [near_object](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### nearVector Example\n", 239 | "**First example** - Search Weaviate with a vector embedding, and return title property.\n", 240 | "\n", 241 | "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector) for more." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "response = my_collection.query.near_vector(\n", 251 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n", 252 | " limit=2,\n", 253 | ")\n", 254 | "\n", 255 | "for item in response.objects:\n", 256 | " print(item.uuid)\n", 257 | " print(item.properties, \"\\n\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "**Second example** - The same search query, but this time also return `distance` and `vector`." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "from weaviate.classes.query import MetadataQuery\n", 274 | "\n", 275 | "response = my_collection.query.near_vector(\n", 276 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n", 277 | " include_vector=True,\n", 278 | " return_metadata=MetadataQuery(distance=True),\n", 279 | " limit=2,\n", 280 | ")\n", 281 | "\n", 282 | "for item in response.objects:\n", 283 | " print(item.properties)\n", 284 | " print(item.metadata.distance)\n", 285 | " print(item.vector, \"\\n\")" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "**Third example** – Same vector query, but this time we will filter on \"foo\" (which should be greater than 30). Also, let's return \"title\" and \"foo\".\n", 293 | "\n", 294 | "See [the docs](https://weaviate.io/developers/weaviate/search/filters#filter-with-one-condition) for more." 
295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "from weaviate.classes.query import Filter, MetadataQuery\n", 304 | "\n", 305 | "response = my_collection.query.near_vector(\n", 306 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n", 307 | " return_metadata=MetadataQuery(distance=True),\n", 308 | " filters=Filter.by_property(\"foo\").greater_than(30),\n", 309 | " limit=2,\n", 310 | ")\n", 311 | "\n", 312 | "for item in response.objects:\n", 313 | " print(item.properties)\n", 314 | " print(item.metadata.distance, \"\\n\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### nearObject Example\n", 322 | "\n", 323 | "Weaviate also allows you to search for similar objects.\n", 324 | "\n", 325 | "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object) for more.\n", 326 | "\n", 327 | "**Fourth example** - \n", 328 | "Search through `MyCollection` for similar objects, by providing an id from the previous query. \n", 329 | "\n", 330 | "> Note #1: The id was taken from the query above
\n", 331 | "> The generated id for you might be different.\n", 332 | "\n", 333 | "> Note #2: The first object returned is always itself." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from weaviate.classes.query import MetadataQuery\n", 343 | "\n", 344 | "response = my_collection.query.near_object(\n", 345 | " near_object=\"20805faa-f0b6-404a-aa34-8a44e01e0bcd\",\n", 346 | " return_metadata=MetadataQuery(distance=True),\n", 347 | " limit=3,\n", 348 | ")\n", 349 | "\n", 350 | "for item in response.objects:\n", 351 | " print(item.uuid)\n", 352 | " print(item.properties)\n", 353 | " print(item.metadata.distance, \"\\n\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Close the client" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "client.close()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": ".venv (3.11.9)", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.11.9" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 2 401 | } 402 | -------------------------------------------------------------------------------- /1-intro/complete/1-load-data-complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": 
[ 7 | "# Collection setup and data load" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Get keys and urls" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "from dotenv import load_dotenv\n", 25 | "\n", 26 | "load_dotenv()\n", 27 | "\n", 28 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 29 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 30 | "\n", 31 | "print(WEAVIATE_URL[:10])\n", 32 | "print(WEAVIATE_KEY[:10])" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Connect to Weaviate\n", 40 | "\n", 41 | "You need to pass in your Weaviate Cloud URL and KEY." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import weaviate\n", 51 | "from weaviate.classes.init import Auth\n", 52 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 53 | "\n", 54 | "client = weaviate.connect_to_weaviate_cloud(\n", 55 | " cluster_url=WEAVIATE_URL,\n", 56 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 57 | "\n", 58 | " # additional_config=AdditionalConfig(\n", 59 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 60 | " # )\n", 61 | ")\n", 62 | "\n", 63 | "client.is_ready()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Create a collection with a vectorizer\n", 71 | "\n", 72 | "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n", 73 | "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n", 74 | "\n", 75 | "Examples of other embedding models:\n", 76 | "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n", 77 | "* 
[HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n", 78 | "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n", 79 | "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from weaviate.classes.config import Configure\n", 89 | "\n", 90 | "if client.collections.exists(\"Jeopardy\"):\n", 91 | " client.collections.delete(\"Jeopardy\")\n", 92 | "\n", 93 | "# Create a collection - with Weaviate vectorizer\n", 94 | "client.collections.create(\n", 95 | " name=\"Jeopardy\",\n", 96 | "\n", 97 | " # https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings\n", 98 | " vector_config=Configure.Vectors.text2vec_weaviate(\n", 99 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 100 | " dimensions=256 # options 1024 (default) and 256\n", 101 | " ),\n", 102 | ")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Import data\n", 110 | "### Sample Data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import json\n", 120 | "\n", 121 | "with open(\"../jeopardy_tiny.json\") as file:\n", 122 | " data_10 = json.load(file)\n", 123 | "\n", 124 | "print(json.dumps(data_10[0:2], indent=2))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### Insert Many\n", 132 | "\n", 133 | "> `insert_many` is only used for inserting small batches of data - must complete within the timeout.\n", 134 | "\n", 135 | "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": 
{}, 142 | "outputs": [], 143 | "source": [ 144 | "# Insert data\n", 145 | "jeopardy = client.collections.use(\"Jeopardy\")\n", 146 | "jeopardy.data.insert_many(data_10)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Data preview" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Show data preview\n", 163 | "jeopardy = client.collections.use(\"Jeopardy\")\n", 164 | "response = jeopardy.query.fetch_objects(limit=4)\n", 165 | "\n", 166 | "for item in response.objects:\n", 167 | " print(item.uuid, item.properties)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Show data preview - with vectors\n", 177 | "jeopardy = client.collections.use(\"Jeopardy\")\n", 178 | "response = jeopardy.query.fetch_objects(\n", 179 | " limit=4,\n", 180 | " include_vector=True\n", 181 | ")\n", 182 | "\n", 183 | "for item in response.objects:\n", 184 | " print(item.properties)\n", 185 | " print(item.vector, '\\n')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Super quick query example" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "response = jeopardy.query.near_text(\n", 202 | " query=\"African animals\",\n", 203 | " # query=\"weather\",\n", 204 | " limit=2\n", 205 | ")\n", 206 | "\n", 207 | "for item in response.objects:\n", 208 | " print(item.properties)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## A bit bigger example - 2k objects" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### Load data" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 
227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "import json\n", 232 | "\n", 233 | "with open(\"../wiki-2k.json\") as file:\n", 234 | " data_2k = json.load(file)\n", 235 | "\n", 236 | "print(json.dumps(data_2k[0:2], indent=2))" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### Create a collection with Named Vectors and SourceProperties" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "from weaviate.classes.config import Configure, Property, DataType\n", 253 | "\n", 254 | "def create_wiki_collection():\n", 255 | " if client.collections.exists(\"Wiki\"):\n", 256 | " client.collections.delete(\"Wiki\")\n", 257 | "\n", 258 | " # Create a collection here - with Weaviate vectorizer and define source properties\n", 259 | " client.collections.create(\n", 260 | " name=\"Wiki\",\n", 261 | "\n", 262 | " vector_config=[\n", 263 | " Configure.Vectors.text2vec_weaviate(\n", 264 | " name=\"main_vector\",\n", 265 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\", # default\n", 266 | " source_properties=['title', 'text'] # which properties should be used to generate a vector\n", 267 | " )\n", 268 | " ],\n", 269 | "\n", 270 | " # Example: how to define property schema (Optional)\n", 271 | " # properties=[ \n", 272 | " # Property(name=\"title\", data_type=DataType.TEXT),\n", 273 | " # Property(name=\"text\", data_type=DataType.TEXT),\n", 274 | " # Property(name=\"url\", data_type=DataType.TEXT),\n", 275 | " # Property(name=\"wiki_id\", data_type=DataType.TEXT),\n", 276 | " # ],\n", 277 | " )\n", 278 | "\n", 279 | "create_wiki_collection()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "### Import data - 2k objects with Batch\n", 287 | "\n", 288 | "Batch speeds up the import process by grouping objects to be added in 
bigger batch groups.\n", 289 | "\n", 290 | "Batch creates an internal buffer to collect objects to be added.
\n", 291 | "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n", 292 | "\n", 293 | "Types of batch:\n", 294 | "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n", 295 | "* `fixed_size` - provide a fixed batch_size\n", 296 | "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### Take 1 – import sample 100" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "from tqdm import tqdm\n", 313 | "\n", 314 | "sample_100 = data_2k[0:100]\n", 315 | "\n", 316 | "wiki = client.collections.use(\"Wiki\")\n", 317 | "\n", 318 | "with wiki.batch.dynamic() as batch:\n", 319 | " for item in tqdm(sample_100):\n", 320 | " batch.add_object(item)\n", 321 | "\n", 322 | "print(f\"Wiki count: {len(wiki)}\")" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# check for errors\n", 332 | "if(len(wiki.batch.failed_objects)>0):\n", 333 | " print(\"Import complete with errors\")\n", 334 | " for err in wiki.batch.failed_objects:\n", 335 | " print(err)\n", 336 | "else:\n", 337 | " print(\"Import complete with no errors\")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "### Take 2 – import sample 100 – with UUID\n", 345 | "\n", 346 | "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property." 
347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "from weaviate.util import generate_uuid5\n", 356 | "\n", 357 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 358 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 359 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 360 | "print(\"====================================\")\n", 361 | "\n", 362 | "print(generate_uuid5(\"This UUID is different\"))\n", 363 | "print(generate_uuid5(\"This UUID is different\"))\n", 364 | "print(\"====================================\")\n", 365 | "\n", 366 | "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n", 367 | "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n", 368 | "print(generate_uuid5(obj1))\n", 369 | "print(generate_uuid5(obj2))\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# recreate the collection to start again\n", 379 | "create_wiki_collection()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "> Rerun the import script multiple times.\n", 387 | "\n", 388 | "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase." 
389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "from tqdm import tqdm\n", 398 | "from weaviate.util import generate_uuid5\n", 399 | "\n", 400 | "sample_100 = data_2k[0:100]\n", 401 | "\n", 402 | "wiki = client.collections.use(\"Wiki\")\n", 403 | "\n", 404 | "with wiki.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:\n", 405 | " for item in tqdm(sample_100):\n", 406 | " id = generate_uuid5(item[\"wiki_id\"])\n", 407 | "\n", 408 | " batch.add_object(\n", 409 | " item,\n", 410 | " uuid=id\n", 411 | " )\n", 412 | "\n", 413 | "print(f\"Wiki count: {len(wiki)}\")" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "### Take 2 - import the rest of the data - but break if multiple errors" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "from tqdm import tqdm\n", 430 | "from weaviate.util import generate_uuid5\n", 431 | "\n", 432 | "wiki = client.collections.use(\"Wiki\")\n", 433 | "\n", 434 | "with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n", 435 | " for item in tqdm(data_2k):\n", 436 | " id = generate_uuid5(item[\"wiki_id\"])\n", 437 | " batch.add_object(item, uuid=id)\n", 438 | "\n", 439 | " # Check number of errors while running\n", 440 | " if(batch.number_errors > 10):\n", 441 | " print(\"Errors during batch import\")\n", 442 | " break" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "### Check for errors" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "if(len(wiki.batch.failed_objects)>0):\n", 459 | " print(\"Import complete with errors\")\n", 460 | " for err in wiki.batch.failed_objects:\n", 461 | " print(err)\n", 
462 | "else:\n", 463 | " print(\"Import complete with no errors\")" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Bonus - iterate through all collection data\n", 471 | "\n", 472 | "The client has a built-in function that allows you to iterate through all collection data." 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "wiki = client.collections.use(\"Wiki\")\n", 482 | "\n", 483 | "counter = 100\n", 484 | "\n", 485 | "for item in wiki.iterator():\n", 486 | " print(item.properties)\n", 487 | "\n", 488 | " if (counter == 0): break\n", 489 | " \n", 490 | " counter -= 1" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "You can also get `vector embeddings`, by using `include_vector`." 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "counter = 10\n", 507 | "\n", 508 | "for item in wiki.iterator(include_vector=True):\n", 509 | " print(item.properties)\n", 510 | " print(item.vector)\n", 511 | "\n", 512 | " if (counter == 0): break\n", 513 | " \n", 514 | " counter -= 1" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "## Close the client" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "client.close()" 531 | ] 532 | } 533 | ], 534 | "metadata": { 535 | "kernelspec": { 536 | "display_name": ".venv (3.11.9)", 537 | "language": "python", 538 | "name": "python3" 539 | }, 540 | "language_info": { 541 | "codemirror_mode": { 542 | "name": "ipython", 543 | "version": 3 544 | }, 545 | "file_extension": ".py", 546 | "mimetype": "text/x-python", 547 | "name": "python", 548 | "nbconvert_exporter": "python", 549 | 
"pygments_lexer": "ipython3", 550 | "version": "3.11.9" 551 | } 552 | }, 553 | "nbformat": 4, 554 | "nbformat_minor": 2 555 | } 556 | -------------------------------------------------------------------------------- /1-intro/1-load-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Collection setup and data load" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Get keys and urls" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "from dotenv import load_dotenv\n", 25 | "\n", 26 | "load_dotenv()\n", 27 | "\n", 28 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n", 29 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n", 30 | "\n", 31 | "print(WEAVIATE_URL[:10])\n", 32 | "print(WEAVIATE_KEY[:10])\n", 33 | "\n", 34 | "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n", 35 | " raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Connect to Weaviate\n", 43 | "\n", 44 | "You need to pass in your Weaviate Cloud URL and KEY." 
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import weaviate\n", 54 | "from weaviate.classes.init import Auth\n", 55 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n", 56 | "\n", 57 | "client = weaviate.connect_to_weaviate_cloud(\n", 58 | " cluster_url=WEAVIATE_URL,\n", 59 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n", 60 | "\n", 61 | " # additional_config=AdditionalConfig(\n", 62 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n", 63 | " # )\n", 64 | ")\n", 65 | "\n", 66 | "client.is_ready()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Create a collection with a vectorizer\n", 74 | "\n", 75 | "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n", 76 | "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n", 77 | "\n", 78 | "Examples of other embedding models:\n", 79 | "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n", 80 | "* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n", 81 | "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n", 82 | "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from weaviate.classes.config import Configure\n", 92 | "\n", 93 | "if client.collections.exists(\"Jeopardy\"):\n", 94 | " client.collections.delete(\"Jeopardy\")\n", 95 | "\n", 96 | "# Create a collection - with Weaviate vectorizer\n", 97 | "client.collections.create(\n", 98 | " name=\"Jeopardy\",\n", 99 | " # TODO: add 
text2vec_weaviate vectorizer - with:\n", 100 | " # * model - Snowflake/snowflake-arctic-embed-l-v2.0\n", 101 | " \n", 102 | ")" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Import data\n", 110 | "### Sample Data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import json\n", 120 | "\n", 121 | "with open(\"./jeopardy_tiny.json\") as file:\n", 122 | " data_10 = json.load(file)\n", 123 | "\n", 124 | "print(json.dumps(data_10[0:2], indent=2))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### Insert Many\n", 132 | "\n", 133 | "> `insert_many` is only used for inserting small batches of data - must complete within the timeout.\n", 134 | "\n", 135 | "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Insert data\n", 145 | "\n", 146 | "# TODO: get Jeopardy collection\n", 147 | "# TODO: insert data_10" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Data preview" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# Show data preview\n", 164 | "jeopardy = client.collections.use(\"Jeopardy\")\n", 165 | "\n", 166 | "# TODO: fetch 4 objects\n", 167 | "# response = jeopardy\n", 168 | "\n", 169 | "for item in response.objects:\n", 170 | " print(item.uuid, item.properties)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# Show data preview - with vectors\n", 180 | "response = jeopardy.query.fetch_objects(\n", 181 | 
" limit=4,\n", 182 | " # TODO: add include_vector=True\n", 183 | ")\n", 184 | "\n", 185 | "for item in response.objects:\n", 186 | " print(item.properties)\n", 187 | " print(item.vector, '\\n')" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "### Super quick query example" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# TODO: add near text query, search for African animals with limit 2\n", 204 | "# response = jeopardy.query\n", 205 | "\n", 206 | "for item in response.objects:\n", 207 | " print(item.properties)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## A bit bigger example - 2k objects" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Load data" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import json\n", 231 | "\n", 232 | "with open(\"./wiki-2k.json\") as file:\n", 233 | " data_2k = json.load(file)\n", 234 | "\n", 235 | "print(json.dumps(data_2k[0:2], indent=2))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Create a collection with Named Vectors and SourceProperties" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "from weaviate.classes.config import Configure, Property, DataType\n", 252 | "\n", 253 | "def create_wiki_collection():\n", 254 | " if client.collections.exists(\"Wiki\"):\n", 255 | " client.collections.delete(\"Wiki\")\n", 256 | "\n", 257 | " # Create a collection here - with Weaviate vectorizer and define source properties\n", 258 | " client.collections.create(\n", 259 | " name=\"Wiki\",\n", 260 | "\n", 261 | " 
vector_config=[\n", 262 | " # NOTE: we are using NamedVectors here\n", 263 | " Configure.Vectors.text2vec_weaviate(\n", 264 | " name=\"main_vector\",\n", 265 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n", 266 | "\n", 267 | " # TODO: set source properties to \"title\" and \"text\"\n", 268 | " # source_properties=[] # which properties should be used to generate a vector\n", 269 | " )\n", 270 | " ],\n", 271 | "\n", 272 | " # Example: how to define property schema (Optional)\n", 273 | " # properties=[ \n", 274 | " # Property(name=\"title\", data_type=DataType.TEXT),\n", 275 | " # Property(name=\"text\", data_type=DataType.TEXT),\n", 276 | " # Property(name=\"url\", data_type=DataType.TEXT),\n", 277 | " # Property(name=\"wiki_id\", data_type=DataType.TEXT),\n", 278 | " # ],\n", 279 | " )\n", 280 | "\n", 281 | "create_wiki_collection()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "### Import data - 2k objects with Batch\n", 289 | "\n", 290 | "Batch speeds up the import process by grouping objects to be added in bigger batch groups.\n", 291 | "\n", 292 | "Batch creates an internal buffer to collect objects to be added.
\n", 293 | "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n", 294 | "\n", 295 | "Types of batch:\n", 296 | "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n", 297 | "* `fixed_size` - provide a fixed batch_size\n", 298 | "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Take 1 – import sample 100" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "from tqdm import tqdm\n", 315 | "\n", 316 | "sample_100 = data_2k[0:100]\n", 317 | "\n", 318 | "wiki = client.collections.use(\"Wiki\")\n", 319 | "\n", 320 | "# TODO: setup dynamic batch\n", 321 | "# loop through the sample_100 data\n", 322 | "# add each object to the batch\n", 323 | "\n", 324 | "print(f\"Wiki count: {len(wiki)}\")" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# check for errors\n", 334 | "if(len(wiki.batch.failed_objects)>0):\n", 335 | " print(\"Import complete with errors\")\n", 336 | " for err in wiki.batch.failed_objects:\n", 337 | " print(err)\n", 338 | "else:\n", 339 | " print(\"Import complete with no errors\")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "### Take 2 – import sample 100 – with UUID\n", 347 | "\n", 348 | "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property." 
349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "from weaviate.util import generate_uuid5\n", 358 | "\n", 359 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 360 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 361 | "print(generate_uuid5(\"This UUID is always the same\"))\n", 362 | "print(\"====================================\")\n", 363 | "\n", 364 | "print(generate_uuid5(\"This UUID is different\"))\n", 365 | "print(generate_uuid5(\"This UUID is different\"))\n", 366 | "print(\"====================================\")\n", 367 | "\n", 368 | "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n", 369 | "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n", 370 | "print(generate_uuid5(obj1))\n", 371 | "print(generate_uuid5(obj2))\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# recreate the collection to start again\n", 381 | "create_wiki_collection()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "> Rerun the import script multiple times.\n", 389 | "\n", 390 | "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase." 
391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "from tqdm import tqdm\n", 400 | "from weaviate.util import generate_uuid5\n", 401 | "\n", 402 | "sample_100 = data_2k[0:100]\n", 403 | "\n", 404 | "wiki = client.collections.use(\"Wiki\")\n", 405 | "\n", 406 | "with wiki.batch.fixed_size(batch_size=50, concurrent_requests=2) as batch:\n", 407 | " for item in tqdm(sample_100):\n", 408 | " # TODO: generate an id from item[\"wiki_id\"]\n", 409 | " # id = \n", 410 | "\n", 411 | " batch.add_object(\n", 412 | " item,\n", 413 | " # TODO: provide the new id here \n", 414 | " # uuid=\n", 415 | " )\n", 416 | "\n", 417 | "print(f\"Wiki count: {len(wiki)}\")" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "### Take 2 - import the rest of the data - but break if multiple errors" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "from tqdm import tqdm\n", 434 | "from weaviate.util import generate_uuid5\n", 435 | "\n", 436 | "wiki = client.collections.use(\"Wiki\")\n", 437 | "\n", 438 | "with wiki.batch.fixed_size(batch_size=600, concurrent_requests=2) as batch:\n", 439 | " for item in tqdm(data_2k):\n", 440 | " id = generate_uuid5(item[\"wiki_id\"])\n", 441 | " batch.add_object(item, uuid=id)\n", 442 | "\n", 443 | " # Check number of errors while running\n", 444 | " if(batch.number_errors > 10):\n", 445 | " print(\"Errors during batch import\")\n", 446 | " break" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "### Check for errors" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "if(len(wiki.batch.failed_objects)>0):\n", 463 | " print(\"Import complete with errors\")\n", 464 
| " for err in wiki.batch.failed_objects:\n", 465 | " print(err)\n", 466 | "else:\n", 467 | " print(\"Import complete with no errors\")" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## Bonus - iterate through all collection data\n", 475 | "\n", 476 | "The client has a built-in function that allows you to iterate through all collection data." 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "wiki = client.collections.use(\"Wiki\")\n", 486 | "\n", 487 | "counter = 100\n", 488 | "\n", 489 | "for item in wiki.iterator():\n", 490 | " print(item.properties)\n", 491 | "\n", 492 | " if (counter == 0): break\n", 493 | " \n", 494 | " counter -= 1" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "You can also get `vector embeddings`, by using `include_vector`." 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "counter = 10\n", 511 | "\n", 512 | "for item in wiki.iterator(include_vector=True):\n", 513 | " print(item.properties)\n", 514 | " print(item.vector)\n", 515 | "\n", 516 | " if (counter == 0): break\n", 517 | " \n", 518 | " counter -= 1" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "## Close the client" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "client.close()" 535 | ] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 540 | "display_name": ".venv (3.11.9)", 541 | "language": "python", 542 | "name": "python3" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | 
"name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython3", 554 | "version": "3.11.9" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 2 559 | } 560 | --------------------------------------------------------------------------------