├── img
├── wcd-create-cluster-1.jpg
├── wcd-create-cluster-2.jpg
└── wcd-enable-async-indexing.png
├── .env
├── 4-multi-tenancy
├── dl_data
│ ├── 2212.10496.pdf
│ └── 2401.00107.pdf
├── 3a-generate-data.ipynb
├── 2-setup-run.ipynb
├── 4-search-tenants.ipynb
├── 3b-load-data.ipynb
└── 1-playground-run.ipynb
├── .claude
└── settings.local.json
├── .gitignore
├── install.md
├── _docker
├── docker-compose-ollama.yml
├── docker-compose-ollama-codespace.yml
├── docker-compose.yml
└── docker-compose-clip.yml
├── prep-data.py
├── .devcontainer
└── devcontainer.json
├── 1-intro
├── jeopardy_tiny.json
├── 0-prep-run.ipynb
├── complete
│ ├── 2-query-complete.ipynb
│ └── 1-load-data-complete.ipynb
├── 2-query.ipynb
└── 1-load-data.ipynb
├── README.md
├── 5-vector-compression
├── data_loader.py
├── 2-search-run.ipynb
├── 1-rq-run.ipynb
├── 1-sq-run.ipynb
├── 1-bq-run.ipynb
├── 1-pq-run.ipynb
└── 0-vector-indexes.ipynb
├── prep-data.ipynb
├── 2-pre-vectorised-data
├── 3-wiki-search-run.ipynb
├── 2-wiki-import.ipynb
├── complete
│ └── 2-wiki-import-complete.ipynb
└── 1-playground-run.ipynb
├── requirements.txt
└── 3-rag
├── 2-rag-gen-query-run.ipynb
├── complete
└── 1-rag-complete.ipynb
└── 1-rag.ipynb
/img/wcd-create-cluster-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-1.jpg
--------------------------------------------------------------------------------
/img/wcd-create-cluster-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-2.jpg
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | # WEAVIATE_URL=your_weaviate_url_here
2 | # WEAVIATE_KEY=your_weaviate_key_here
3 | # OPENAI_API_KEY=your_openai_api_key
4 |
5 |
--------------------------------------------------------------------------------
/img/wcd-enable-async-indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-enable-async-indexing.png
--------------------------------------------------------------------------------
/4-multi-tenancy/dl_data/2212.10496.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2212.10496.pdf
--------------------------------------------------------------------------------
/4-multi-tenancy/dl_data/2401.00107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2401.00107.pdf
--------------------------------------------------------------------------------
/.claude/settings.local.json:
--------------------------------------------------------------------------------
1 | {
2 | "permissions": {
3 | "allow": [
4 | "mcp__ide__executeCode",
5 | "Bash(pip show:*)"
6 | ],
7 | "deny": [],
8 | "ask": []
9 | }
10 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv*
2 | __pycache__/
3 |
4 | # distill files
5 | # **/dl_data/
6 | .DS_Store
7 |
8 |
9 | # ignore temp files/folders with names starting with __
10 | __*
11 |
12 | # ignore big files
13 | *.parquet
14 | wiki-data
--------------------------------------------------------------------------------
/install.md:
--------------------------------------------------------------------------------
1 | ## How to set up the Python environment with venv
2 | To run the project locally, it is best to set up a Python environment with venv.
3 |
4 | ### Setup – do this only once
5 | First create a new venv configuration.
6 | ```
7 | python3 -m venv .venv
8 | ```
9 |
10 | Then switch to the new configuration:
11 | ```
12 | source .venv/bin/activate
13 | ```
14 |
15 | And install the required packages.
16 | ```
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ### How to use after
21 |
22 | **Activate**
23 | If, in the future, you need to switch back to the venv environment, just call:
24 | ```
25 | source .venv/bin/activate
26 | ```
27 |
28 | **Deactivate**
29 | To disconnect from the venv environment, call:
30 | ```
31 | deactivate
32 | ```
--------------------------------------------------------------------------------
/_docker/docker-compose-ollama.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: '3.4'
3 | services:
4 | weaviate:
5 | command:
6 | - --host
7 | - 0.0.0.0
8 | - --port
9 | - '8080'
10 | - --scheme
11 | - http
12 | image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
13 | ports:
14 | - 8080:8080
15 | - 50051:50051
16 | volumes:
17 | - weaviate_data:/var/lib/weaviate
18 | restart: on-failure:0
19 | environment:
20 | QUERY_DEFAULTS_LIMIT: 25
21 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
22 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
23 | DEFAULT_VECTORIZER_MODULE: 'none'
24 | ENABLE_API_BASED_MODULES: 'true'
25 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama'
26 | CLUSTER_HOSTNAME: 'node1'
27 | volumes:
28 | weaviate_data:
29 | ...
--------------------------------------------------------------------------------
/_docker/docker-compose-ollama-codespace.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: '3.4'
3 | services:
4 | weaviate:
5 | command:
6 | - --host
7 | - 0.0.0.0
8 | - --port
9 | - '8080'
10 | - --scheme
11 | - http
12 | image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
13 | ports:
14 | - 8080:8080
15 | - 50051:50051
16 | volumes:
17 | - weaviate_data:/var/lib/weaviate
18 | restart: on-failure:0
19 | environment:
20 | QUERY_DEFAULTS_LIMIT: 25
21 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
22 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
23 | DEFAULT_VECTORIZER_MODULE: 'none'
24 | ENABLE_API_BASED_MODULES: 'true'
25 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama'
26 | CLUSTER_HOSTNAME: 'node1'
27 | ollama: # map to locally run ollama models
28 | image: ollama/ollama:0.2.5
29 | volumes:
30 | - /root/.ollama:/root/.ollama
31 | volumes:
32 | weaviate_data:
33 | ...
--------------------------------------------------------------------------------
/_docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: '3.4'
3 | services:
4 | weaviate:
5 | command:
6 | - --host
7 | - 0.0.0.0
8 | - --port
9 | - '8080'
10 | - --scheme
11 | - http
12 | image: semitechnologies/weaviate:1.32.9
13 | ports:
14 | - 8080:8080
15 | - 50051:50051
16 | volumes:
17 | - weaviate_data:/var/lib/weaviate
18 | restart: on-failure:0
19 | environment:
20 | QUERY_DEFAULTS_LIMIT: 25
21 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
22 | ENABLE_API_BASED_MODULES: 'true'
23 | CLUSTER_HOSTNAME: 'node1'
24 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'false'
25 | AUTHENTICATION_DB_USERS_ENABLED: 'true'
26 | AUTHENTICATION_APIKEY_ENABLED: 'true'
27 | AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'root-user-key'
28 | AUTHENTICATION_APIKEY_USERS: 'root-user'
29 | AUTHORIZATION_ENABLE_RBAC: 'true'
30 | AUTHORIZATION_RBAC_ROOT_USERS: 'root-user'
31 | DEFAULT_VECTORIZER_MODULE: 'none'
32 | ENABLE_MODULES: ''
33 | volumes:
34 | weaviate_data:
35 | ...
--------------------------------------------------------------------------------
/_docker/docker-compose-clip.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: '3.4'
3 | services:
4 | weaviate:
5 | command:
6 | - --host
7 | - 0.0.0.0
8 | - --port
9 | - '8080'
10 | - --scheme
11 | - http
12 | image: cr.weaviate.io/semitechnologies/weaviate:1.30.0
13 | ports:
14 | - 8080:8080
15 | - 50051:50051
16 | volumes:
17 | - weaviate_data:/var/lib/weaviate
18 | restart: on-failure:0
19 | environment:
20 | CLIP_INFERENCE_API: 'http://multi2vec-clip:8080'
21 | QUERY_DEFAULTS_LIMIT: 25
22 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
23 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
24 | DEFAULT_VECTORIZER_MODULE: 'none'
25 | ENABLE_API_BASED_MODULES: 'true'
26 | ENABLE_MODULES: 'text2vec-ollama,generative-ollama,multi2vec-clip'
27 | CLUSTER_HOSTNAME: 'node1'
28 | multi2vec-clip:
29 | image: cr.weaviate.io/semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1
30 | environment:
31 | ENABLE_CUDA: '0'
32 | ollama:
33 | image: ollama/ollama:0.2.5
34 | # volumes:
35 | # - /root/.ollama:/root/.ollama
36 | volumes:
37 | weaviate_data:
38 | ...
--------------------------------------------------------------------------------
/prep-data.py:
--------------------------------------------------------------------------------
1 | from huggingface_hub import list_repo_files, hf_hub_download
2 |
def list_wiki_datasets():
    """Return the dataset names found in the `weaviate/wiki-sample` repo.

    A dataset is recognised by its `0001.parquet` file; selecting only that
    file is what keeps a dataset with several parquet files from being
    listed more than once.
    """
    first_shard = "0001.parquet"
    repo_files = list_repo_files("weaviate/wiki-sample", repo_type="dataset")

    # Keep one path per dataset, then strip the file name to get the dataset name.
    return [
        path.replace("/0001.parquet", "")
        for path in repo_files
        if path.endswith(first_shard)
    ]
11 |
def list_dataset_files(dataset):
    """List the files of `weaviate/wiki-sample` whose path starts with `dataset`."""
    repo_files = list_repo_files("weaviate/wiki-sample", repo_type="dataset")
    return [path for path in repo_files if path.startswith(dataset)]
16 |
def download_file(file):
    """Download one file of the `weaviate/wiki-sample` dataset into `wiki-data`."""
    request = {
        "repo_id": "weaviate/wiki-sample",
        "filename": file,
        "repo_type": "dataset",
        "local_dir": "wiki-data",
    }
    hf_hub_download(**request)
24 |
def download_source_files(dataset="no-vectors", max_files=1000):
    """Download up to `max_files` files belonging to `dataset`.

    Args:
        dataset: Path prefix used to select files (see `list_dataset_files`).
        max_files: Maximum number of files to download. Zero or a negative
            number downloads nothing; `None` removes the limit.
    """
    files_to_download = list_dataset_files(dataset)
    # print(f"Files to download: {files_to_download}")

    # Apply the limit up front. The previous countdown (`max_files -= 1` then
    # `== 0`) never triggered for max_files <= 0 and downloaded every file.
    limit = None if max_files is None else max(max_files, 0)

    for file in files_to_download[:limit]:
        print(f"Downloading {file}")
        download_file(file)
35 |
if __name__ == "__main__":
    # Only download when run as a script (e.g. `python3 prep-data.py`), so that
    # loading this module for its helpers does not trigger network downloads.
    download_source_files("weaviate/snowflake-arctic-v2", 10)
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/python-3
3 | {
4 | "name": "Weaviate Playground",
5 | "image": "mcr.microsoft.com/devcontainers/python:3.12-bullseye",
6 | "features": {
7 | "ghcr.io/devcontainers/features/docker-in-docker:2.9": {}
8 | },
9 | // Configure tool-specific properties.
10 | "customizations": {
11 | // Configure properties specific to VS Code.
12 | "vscode": {
13 | // Set *default* container specific settings.json values on container create.
14 | "settings": {
15 | "python.defaultInterpreterPath": "/usr/local/bin/python",
16 | "files.exclude": {
17 | "__pycache__": true
18 | }
19 | },
20 |
21 | // Add the IDs of extensions you want installed when the container is created.
22 | "extensions": [
23 | "ms-azuretools.vscode-docker",
24 | "ms-python.python",
25 | "ms-toolsai.jupyter"
26 | ]
27 | }
28 | },
29 |
30 | // Use 'postStartCommand' to run commands after the container is started (more frequently than create).
31 | "postStartCommand": "pip3 install --user -r requirements.txt && python3 prep-data.py",
32 |
33 | "hostRequirements": {
34 | "memory": "16gb",
35 | "cpus": 4
36 | },
37 |
38 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
39 | "remoteUser": "vscode"
40 | }
--------------------------------------------------------------------------------
/1-intro/jeopardy_tiny.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "Category": "SCIENCE",
4 | "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
5 | "Answer": "Liver"
6 | },
7 | {
8 | "Category": "ANIMALS",
9 | "Question": "It's the only living mammal in the order Proboseidea",
10 | "Answer": "Elephant"
11 | },
12 | {
13 | "Category": "ANIMALS",
14 | "Question": "The gavial looks very much like a crocodile except for this bodily feature",
15 | "Answer": "the nose or snout"
16 | },
17 | {
18 | "Category": "ANIMALS",
19 | "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
20 | "Answer": "Antelope"
21 | },
22 | {
23 | "Category": "ANIMALS",
24 | "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
25 | "Answer": "the diamondback rattler"
26 | },
27 | {
28 | "Category": "SCIENCE",
29 | "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
30 | "Answer": "species"
31 | },
32 | {
33 | "Category": "SCIENCE",
34 | "Question": "A metal that is ductile can be pulled into this while cold & under pressure",
35 | "Answer": "wire"
36 | },
37 | {
38 | "Category": "SCIENCE",
39 | "Question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance",
40 | "Answer": "DNA"
41 | },
42 | {
43 | "Category": "SCIENCE",
44 | "Question": "Changes in the tropospheric layer of this are what gives us weather",
45 | "Answer": "the atmosphere"
46 | },
47 | {
48 | "Category": "SCIENCE",
49 | "Question": "In 70-degree air, a plane traveling at about 1,130 feet per second breaks it",
50 | "Answer": "Sound barrier"
51 | }
52 | ]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Weaviate Workshop
2 |
3 | ## What you need for the workshop
4 |
5 | * API Keys for embedding models, like:
6 | * OpenAI - [API keys](https://platform.openai.com/settings/profile?tab=api-keys)
7 | * etc,
8 |
9 | ## Create a Weaviate Cloud instance
10 |
11 | * Head to [Weaviate Cloud console](https://console.weaviate.cloud/) and log in, or create a new account.
12 | * Create a free `Sandbox` cluster. Give it a name, select the cloud region and press "Create".
13 |
14 | 
15 | 
16 |
17 | ## Running the workshop
18 |
19 | ### Option 1 - Run locally
20 |
21 | #### Virtual environment – do this only once
22 | First create a new venv configuration.
23 | ```
24 | python3 -m venv .venv
25 | ```
26 |
27 | Then switch to the new configuration:
28 | ```
29 | source .venv/bin/activate
30 | ```
31 |
32 | And install the required packages.
33 | ```
34 | pip install -r requirements.txt
35 | ```
36 |
37 | ### Option 2 - GitHub Codespaces instructions
38 |
39 | 1. Go to the project [https://github.com/weaviate-tutorials/weaviate-workshop](https://github.com/weaviate-tutorials/weaviate-workshop)
40 |
41 | Make sure you are logged in with GitHub.
42 |
43 | 2. Create a Codespace project
44 | * Press the green `<> Code` button, then switch to `Codespaces` tab.
45 | * Press the `Create codespace on main` button.
46 | * Your codespace project will install all the necessary components; this will take a few minutes.
47 |
48 |
49 | ## Env vars
50 |
51 | Update env vars in .env.
52 |
53 | Hint: you can find your Weaviate Cluster URL and API keys in the [WCD console](https://console.weaviate.cloud/).
54 |
55 | * WEAVIATE_URL - is the `REST Endpoint`
56 | * WEAVIATE_KEY - is the `Admin` key in `API Keys`
57 |
58 | ## Test your setup
59 |
60 | Head to [1-intro/0-prep-run.ipynb](./1-intro/0-prep-run.ipynb), and run through all steps.
61 |
62 | ## Download the prevectorized data
63 |
64 | Head to [prep-data.ipynb](./prep-data.ipynb) and run all the cells. This should download the data we will use in the second lesson.
65 |
66 | ## Enable async indexing in the Cloud Console or in Docker
67 |
68 | In the Cloud Console
69 | 
70 |
--------------------------------------------------------------------------------
/5-vector-compression/data_loader.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from tqdm import tqdm
3 | from weaviate.util import generate_uuid5
4 |
def prepare_dataset():
    """Open the local wiki parquet files as a streaming `train` split.

    Prints the dataset and config names reported by the loaded dataset's
    info, then returns the dataset.
    """
    parquet_glob = '../wiki-data/weaviate/snowflake-arctic-v2/*.parquet'
    dt = load_dataset(
        'parquet',
        data_files={'train': [parquet_glob]},
        split="train",
        streaming=True,
    )
    # Alternative kept for reference: load from the hub instead of local files.
    # dt = load_dataset("weaviate/wiki-sample", "weaviate-snowflake-arctic-v2", split="train", streaming=True)

    info = dt.info
    print(f"Loaded Dataset: '{info.dataset_name}' - Config: '{info.config_name}'")

    return dt
12 |
def test_dataset():
    """Print the first 10 items of the prepared dataset as a quick check."""
    preview_size = 10

    for position, row in enumerate(prepare_dataset(), start=1):
        print(row)

        # Stop right after the last previewed row, without pulling another one.
        if position == preview_size:
            break
22 |
def import_wiki_data(client, collection_name, max_rows=20_000):
    """Batch-import up to `max_rows` wiki rows into an existing collection.

    Rows come from `prepare_dataset()`. Each row is inserted with its
    `wiki_id`, `text`, `title` and `url` properties, a UUID generated from
    `wiki_id`, and its `vector` stored under the `main_vector` name.

    Args:
        client: Client object exposing `collections.exists` and
            `collections.use` (presumably a connected Weaviate client).
        collection_name: Target collection. If it does not exist, an error is
            printed and nothing is imported.
        max_rows: Maximum number of rows to send. Zero or a negative value
            sends nothing.

    Returns:
        None. Progress and errors are reported with `print`.
    """
    from itertools import islice

    if not client.collections.exists(collection_name):
        print(f"Error: Collection {collection_name} doesn't exist")
        return

    print(f"Importing {max_rows} data items")

    dataset = prepare_dataset()
    wiki = client.collections.use(collection_name)

    # Clamp the limit so max_rows <= 0 sends nothing. The limit used to be
    # checked only after an object had been added, so one row always went out.
    row_limit = max(max_rows, 0)
    counter = 0

    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
        # islice stops after row_limit rows without pulling an extra row
        # from the dataset.
        for item in tqdm(islice(dataset, row_limit), total=row_limit):

            data_to_insert = {
                "wiki_id": item["wiki_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
            }

            item_id = generate_uuid5(item["wiki_id"])

            # The vector is passed as a dict keyed by the vector name.
            item_vector = {
                "main_vector": item["vector"]
            }

            batch.add_object(
                properties=data_to_insert,
                uuid=item_id,
                vector=item_vector
            )

            # Check number of errors while running
            if batch.number_errors > 10:
                print(f"Reached {batch.number_errors} Errors during batch import")
                break

            counter += 1

    # check for errors at the end, once the batch context has been closed
    if len(wiki.batch.failed_objects) > 0:
        print("Final error check")
        print(f"Some errors {len(wiki.batch.failed_objects)}")
        print(wiki.batch.failed_objects[-1])

    print(f"Imported {counter} items")
    print("-----------------------------------")
--------------------------------------------------------------------------------
/prep-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from huggingface_hub import list_repo_files, hf_hub_download\n",
10 | "\n",
11 | "def list_wiki_datasets():\n",
12 | " all_files = list_repo_files(\"weaviate/wiki-sample\", repo_type=\"dataset\")\n",
13 | " \n",
14 | " # get items with 0001 parquet file, this way we avoid duplicates\n",
15 | " items = list(filter(lambda path: path.endswith(\"0001.parquet\"), all_files))\n",
16 | "\n",
17 | " # remove the parquet from the name\n",
18 | " return [item.replace(\"/0001.parquet\", \"\") for item in items]\n",
19 | "\n",
20 | "def list_dataset_files(dataset):\n",
21 | " dataset_files = list_repo_files(\"weaviate/wiki-sample\", repo_type=\"dataset\")\n",
22 | "\n",
23 | " return list(filter(lambda path: path.startswith(dataset), dataset_files))\n",
24 | "\n",
25 | "def download_file(file):\n",
26 | " hf_hub_download(\n",
27 | " repo_id=\"weaviate/wiki-sample\",\n",
28 | " filename=file,\n",
29 | " repo_type=\"dataset\",\n",
30 | " local_dir=\"wiki-data\",\n",
31 | " )\n",
32 | "\n",
33 | "def download_source_files(dataset=\"no-vectors\", max_files=1000):\n",
34 | " files_to_download = list_dataset_files(dataset)\n",
35 | " print(f\"Files to download: {files_to_download}\")\n",
36 | "\n",
37 | " for file in files_to_download:\n",
38 | " print(f\"Downloading {file}\")\n",
39 | " download_file(file)\n",
40 | "\n",
41 | " max_files -= 1\n",
42 | " if(max_files == 0): break"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "list_wiki_datasets()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "download_source_files(\"weaviate/snowflake-arctic-v2\", 10)"
61 | ]
62 | }
63 | ],
64 | "metadata": {
65 | "kernelspec": {
66 | "display_name": ".venv (3.11.9)",
67 | "language": "python",
68 | "name": "python3"
69 | },
70 | "language_info": {
71 | "codemirror_mode": {
72 | "name": "ipython",
73 | "version": 3
74 | },
75 | "file_extension": ".py",
76 | "mimetype": "text/x-python",
77 | "name": "python",
78 | "nbconvert_exporter": "python",
79 | "pygments_lexer": "ipython3",
80 | "version": "3.11.9"
81 | }
82 | },
83 | "nbformat": 4,
84 | "nbformat_minor": 2
85 | }
86 |
--------------------------------------------------------------------------------
/2-pre-vectorised-data/3-wiki-search-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Query Data - show it works\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import weaviate\n",
37 | "from weaviate.classes.init import Auth\n",
38 | "\n",
39 | "client = weaviate.connect_to_weaviate_cloud(\n",
40 | " cluster_url=WEAVIATE_URL,\n",
41 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
42 | ")\n",
43 | "\n",
44 | "client.is_ready()"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "## Vector search"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "wiki = client.collections.use(\"Wiki\")\n",
61 | "\n",
62 | "response = wiki.query.near_text(\n",
63 | " query=\"musical instruments\",\n",
64 | " limit=5\n",
65 | ")\n",
66 | "\n",
67 | "for item in response.objects:\n",
68 | " print(item.properties)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Close the client"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "client.close()"
85 | ]
86 | }
87 | ],
88 | "metadata": {
89 | "kernelspec": {
90 | "display_name": ".venv (3.11.9)",
91 | "language": "python",
92 | "name": "python3"
93 | },
94 | "language_info": {
95 | "codemirror_mode": {
96 | "name": "ipython",
97 | "version": 3
98 | },
99 | "file_extension": ".py",
100 | "mimetype": "text/x-python",
101 | "name": "python",
102 | "nbconvert_exporter": "python",
103 | "pygments_lexer": "ipython3",
104 | "version": "3.11.9"
105 | }
106 | },
107 | "nbformat": 4,
108 | "nbformat_minor": 2
109 | }
110 |
--------------------------------------------------------------------------------
/4-multi-tenancy/3a-generate-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Multi-tenant Chat with Papers - Reading data from papers\n",
8 | "\n",
9 | "### Helper function to load content from arxiv papers - `from_arxiv_paper`"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import json\n",
19 | "from distyll.text import from_arxiv_paper\n",
20 | "\n",
21 | "paper = from_arxiv_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n",
22 | "\n",
23 | "print(json.dumps(paper, indent=2))"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "### Helper function to chunk up a very long text - `chunk_text`"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "json.dumps??"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "from distyll.utils import chunk_text\n",
49 | "\n",
50 | "chunks = chunk_text(source_text=paper[\"text\"], token_length=200)\n",
51 | "print(json.dumps(chunks, indent=2))"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "### Combine read and chunk - `get_chunks_from_paper`"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "from distyll.text import from_arxiv_paper\n",
68 | "from distyll.utils import chunk_text\n",
69 | "\n",
70 | "def get_chunks_from_paper(url):\n",
71 | " paper = from_arxiv_paper(url)\n",
72 | " chunks = chunk_text(source_text=paper[\"text\"])\n",
73 | "\n",
74 | " paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n",
75 | " paper[\"chunks\"] = chunks\n",
76 | " return paper"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "#### Test an example"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")"
93 | ]
94 | }
95 | ],
96 | "metadata": {
97 | "kernelspec": {
98 | "display_name": ".venv (3.11.9)",
99 | "language": "python",
100 | "name": "python3"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.11.9"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 2
117 | }
118 |
--------------------------------------------------------------------------------
/5-vector-compression/2-search-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "from dotenv import load_dotenv\n",
11 | "\n",
12 | "load_dotenv()\n",
13 | "\n",
14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
16 | "\n",
17 | "print(WEAVIATE_URL[:10])\n",
18 | "print(WEAVIATE_KEY[:10])"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import weaviate\n",
28 | "from weaviate.classes.init import Auth\n",
29 | "\n",
30 | "client = weaviate.connect_to_weaviate_cloud(\n",
31 | " cluster_url=WEAVIATE_URL,\n",
32 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
33 | ")\n",
34 | "\n",
35 | "client.is_ready()"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Preview data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from weaviate.classes.query import Filter\n",
52 | "\n",
53 | "wikiQ = client.collections.use(\"WikiQ\")\n",
54 | "\n",
55 | "response = wikiQ.query.fetch_objects(\n",
56 | " filters=Filter.by_property(\"text\").like(\"musical\"),\n",
57 | " limit=5\n",
58 | ")\n",
59 | "\n",
60 | "for item in response.objects:\n",
61 | " print(item.properties[\"wiki_id\"])\n",
62 | " print(item.properties[\"title\"])\n",
63 | " print(item.properties[\"text\"], '\\n')"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "## Vector search"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "wikiQ = client.collections.use(\"WikiQ\")\n",
80 | "\n",
81 | "response = wikiQ.query.near_text(\n",
82 | " query=\"musical instruments\",\n",
83 | " limit=5\n",
84 | ")\n",
85 | "\n",
86 | "for item in response.objects:\n",
87 | " print(item.properties)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "> Have fun! Add your own queries."
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "## Close the client"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "client.close()"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": ".venv (3.11.9)",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.11.9"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 2
135 | }
136 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.5
2 | aiosignal==1.3.1
3 | annotated-types==0.7.0
4 | anyio==4.8.0
5 | appnope==0.1.4
6 | argon2-cffi==23.1.0
7 | argon2-cffi-bindings==21.2.0
8 | arrow==1.3.0
9 | asttokens==3.0.0
10 | async-lru==2.0.4
11 | async-timeout==4.0.3
12 | attrs==23.2.0
13 | Authlib==1.3.1
14 | babel==2.16.0
15 | beautifulsoup4==4.12.3
16 | bleach==6.1.0
17 | boto3==1.34.144
18 | botocore==1.34.144
19 | Brotli==1.1.0
20 | certifi==2025.1.31
21 | cffi==1.17.1
22 | charset-normalizer==3.3.2
23 | cohere==5.6.1
24 | comm==0.2.2
25 | cryptography==44.0.1
26 | datasets==2.20.0
27 | debugpy==1.8.2
28 | decorator==4.4.2
29 | defusedxml==0.7.1
30 | deprecation==2.1.0
31 | dill==0.3.8
32 | distro==1.9.0
33 | distyll-info==0.3.1
34 | dotenv==0.9.9
35 | exceptiongroup==1.3.0
36 | executing==2.2.0
37 | fastavro==1.9.5
38 | fastjsonschema==2.20.0
39 | filelock==3.15.4
40 | fqdn==1.5.1
41 | frozenlist==1.4.1
42 | fsspec==2024.5.0
43 | grpcio==1.70.0
44 | grpcio-health-checking==1.70.0
45 | grpcio-tools==1.70.0
46 | h11==0.14.0
47 | httpcore==1.0.7
48 | httpx==0.28.1
49 | httpx-sse==0.4.0
50 | huggingface-hub==0.23.5
51 | idna==3.10
52 | imageio==2.35.1
53 | imageio-ffmpeg==0.5.1
54 | ipykernel==6.29.5
55 | ipython==8.32.0
56 | ipywidgets==8.1.5
57 | isoduration==20.11.0
58 | jedi==0.19.2
59 | Jinja2==3.1.4
60 | jiter==0.8.2
61 | jmespath==1.0.1
62 | json5==0.9.25
63 | jsonpointer==3.0.0
64 | jsonschema==4.23.0
65 | jsonschema-specifications==2023.12.1
66 | jupyter==1.1.1
67 | jupyter-console==6.6.3
68 | jupyter-events==0.10.0
69 | jupyter-lsp==2.2.5
70 | jupyter_client==8.6.2
71 | jupyter_core==5.7.2
72 | jupyter_server==2.14.2
73 | jupyter_server_terminals==0.5.3
74 | jupyterlab==4.2.5
75 | jupyterlab_pygments==0.3.0
76 | jupyterlab_server==2.27.3
77 | jupyterlab_widgets==3.0.13
78 | load-dotenv==0.1.0
79 | MarkupSafe==2.1.5
80 | matplotlib-inline==0.1.7
81 | mistune==3.0.2
82 | moviepy==1.0.3
83 | multidict==6.0.5
84 | multiprocess==0.70.16
85 | mutagen==1.47.0
86 | nbclient==0.10.0
87 | nbconvert==7.16.4
88 | nbformat==5.10.4
89 | nest-asyncio==1.6.0
90 | notebook==7.2.2
91 | notebook_shim==0.2.4
92 | numpy==2.0.0
93 | openai==1.64.0
94 | overrides==7.7.0
95 | packaging==24.1
96 | pandas==2.2.2
97 | pandocfilters==1.5.1
98 | parameterized==0.9.0
99 | parso==0.8.4
100 | pexpect==4.9.0
101 | pillow==10.4.0
102 | platformdirs==4.2.2
103 | proglog==0.1.10
104 | prometheus_client==0.20.0
105 | prompt_toolkit==3.0.50
106 | protobuf==5.29.3
107 | psutil==6.0.0
108 | ptyprocess==0.7.0
109 | pure_eval==0.2.3
110 | pyarrow==17.0.0
111 | pyarrow-hotfix==0.6
112 | pycparser==2.22
113 | pycryptodomex==3.20.0
114 | pydantic==2.10.6
115 | pydantic_core==2.27.2
116 | pydub==0.25.1
117 | Pygments==2.19.1
118 | pypdf==4.3.1
119 | python-dateutil==2.9.0.post0
120 | python-dotenv==1.0.1
121 | python-json-logger==2.0.7
122 | pytz==2024.1
123 | PyYAML==6.0.1
124 | pyzmq==26.0.3
125 | referencing==0.35.1
126 | requests==2.32.3
127 | rfc3339-validator==0.1.4
128 | rfc3986-validator==0.1.1
129 | rpds-py==0.20.0
130 | s3transfer==0.10.2
131 | Send2Trash==1.8.3
132 | six==1.16.0
133 | sniffio==1.3.1
134 | soupsieve==2.6
135 | stack-data==0.6.3
136 | terminado==0.18.1
137 | tinycss2==1.3.0
138 | tokenizers==0.19.1
139 | tomli==2.2.1
140 | tornado==6.4.1
141 | tqdm==4.67.1
142 | traitlets==5.14.3
143 | types-python-dateutil==2.9.0.20240821
144 | types-requests==2.32.0.20240712
145 | typing_extensions==4.12.2
146 | tzdata==2024.1
147 | uri-template==1.3.0
148 | urllib3==2.2.2
149 | validators==0.34.0
150 | wcwidth==0.2.13
151 | weaviate-client==4.17.0
152 | webcolors==24.8.0
153 | webencodings==0.5.1
154 | websocket-client==1.8.0
155 | websockets==13.0.1
156 | widgetsnbextension==4.0.13
157 | xxhash==3.4.1
158 | yarl==1.9.4
159 | yt-dlp==2023.12.30
160 |
--------------------------------------------------------------------------------
/1-intro/0-prep-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Check if everything is in place"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Weaviate Python Client v4\n",
15 | "> This notebook was created with Weaviate `1.26` and the Weaviate Client `4.7`\n",
16 | "\n",
17 | "Run the command below to check which version of the Weaviate Python Client v4 is installed."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "!pip show weaviate-client"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Get keys and urls\n",
34 | "\n",
35 | "> You can update your env variables in the `.env` file at the root of the project."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import os\n",
45 | "from dotenv import load_dotenv\n",
46 | "\n",
47 | "load_dotenv()\n",
48 | "\n",
49 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
50 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
51 | "\n",
52 | "print(WEAVIATE_URL[:10])\n",
53 | "print(WEAVIATE_KEY[:10])"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Connect to Weaviate"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "import weaviate\n",
70 | "from weaviate.classes.init import Auth\n",
71 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
72 | "\n",
73 | "client = weaviate.connect_to_weaviate_cloud(\n",
74 | " cluster_url=WEAVIATE_URL,\n",
75 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
76 | "\n",
77 | " # additional_config=AdditionalConfig(\n",
78 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
79 | " # )\n",
80 | ")\n",
81 | "\n",
82 | "client.is_ready()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Display the available modules\n",
90 | "\n",
91 | "> You should be able to see 'generative-openai' and 'text2vec-openai', plus many other modules."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "client.get_meta()"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### Close the client\n",
108 | "When you are done with the client, you should close it to release the resources."
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "client.close()"
118 | ]
119 | }
120 | ],
121 | "metadata": {
122 | "kernelspec": {
123 | "display_name": ".venv (3.11.9)",
124 | "language": "python",
125 | "name": "python3"
126 | },
127 | "language_info": {
128 | "codemirror_mode": {
129 | "name": "ipython",
130 | "version": 3
131 | },
132 | "file_extension": ".py",
133 | "mimetype": "text/x-python",
134 | "name": "python",
135 | "nbconvert_exporter": "python",
136 | "pygments_lexer": "ipython3",
137 | "version": "3.11.9"
138 | }
139 | },
140 | "nbformat": 4,
141 | "nbformat_minor": 2
142 | }
143 |
--------------------------------------------------------------------------------
/4-multi-tenancy/2-setup-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Multi-tenant Chat with Papers - Setup\n",
8 | "## Get keys and urls"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import os\n",
18 | "from dotenv import load_dotenv\n",
19 | "\n",
20 | "load_dotenv()\n",
21 | "\n",
22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
23 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
24 | "\n",
25 | "print(WEAVIATE_URL[:10])\n",
26 | "print(WEAVIATE_KEY[:10])"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Connect to Weaviate"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import weaviate\n",
43 | "from weaviate.classes.init import Auth\n",
44 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | "\n",
50 | " # additional_config=AdditionalConfig(\n",
51 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
52 | " # )\n",
53 | ")\n",
54 | "\n",
55 | "client.is_ready()"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Create Tenant-ready collection"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "from weaviate.classes.config import Configure\n",
72 | "\n",
73 | "if (client.collections.exists(\"Papers\")):\n",
74 | " client.collections.delete(\"Papers\")\n",
75 | "\n",
76 | "client.collections.create(\n",
77 | " \"Papers\",\n",
78 | "\n",
79 | " vector_config=[\n",
80 | " Configure.Vectors.text2vec_weaviate(\n",
81 | " name=\"main_vector\",\n",
82 | "\n",
83 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
84 | " source_properties=[\"chunk\"]\n",
85 | " )\n",
86 | " ],\n",
87 | "\n",
88 | " # supported models: https://weaviate.io/developers/weaviate/model-providers/openai/generative#available-models\n",
89 | " generative_config=Configure.Generative.openai(\n",
90 | " model=\"gpt-4o-mini\" # gpt-4\n",
91 | " ),\n",
92 | "\n",
93 | " multi_tenancy_config=Configure.multi_tenancy(True)\n",
94 | ")"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "## List Tenants"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "papers = client.collections.use(\"Papers\")\n",
111 | "papers.tenants.get()"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "## Close the client"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "client.close()"
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": ".venv (3.11.9)",
134 | "language": "python",
135 | "name": "python3"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.11.9"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 2
152 | }
153 |
--------------------------------------------------------------------------------
/5-vector-compression/1-rq-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Compression – Load Data and compress vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | ")\n",
50 | "\n",
51 | "client.is_ready()"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Create Collection with RQ configuration\n",
59 | "\n",
60 | "[Docs: Rotational Quantization (RQ)](https://weaviate.io/developers/weaviate/configuration/compression/rq-compression)\n",
61 | "\n",
62 | "> Note: Rotational Quantization (RQ) does not require a training phase.\n",
63 | "> RQ begins compressing vectors immediately upon insertion, without waiting for a minimum number of objects or a training step.\n",
64 | ">\n",
65 | "> This makes RQ ideal for applications that need immediate compression and fast setup, as vectors are compressed and searchable as soon as they are added to the collection.\n"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "from weaviate.classes.config import Configure\n",
75 | "\n",
76 | "client.collections.delete(\"WikiQ\")\n",
77 | "\n",
78 | "client.collections.create(\n",
79 | " name=\"WikiQ\",\n",
80 | "\n",
81 | " vector_config=[\n",
82 | " Configure.Vectors.text2vec_weaviate(\n",
83 | " name=\"main_vector\",\n",
84 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
85 | " source_properties=['title', 'text'],\n",
86 | "\n",
87 | " # Configure RQ\n",
88 | " vector_index_config=Configure.VectorIndex.hnsw(\n",
89 | " quantizer=Configure.VectorIndex.Quantizer.rq(\n",
90 | " rescore_limit=200, # Number of overfetched candidates used for rescoring\n",
91 | " bits=8 # Number of bits (only 8 is supported)\n",
92 | " )\n",
93 | " ),\n",
94 | " )\n",
95 | " ],\n",
96 | ")"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "## The rest is the same"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "from data_loader import import_wiki_data\n",
113 | "import_wiki_data(client, \"WikiQ\", 25000)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "WikiQ = client.collections.use(\"WikiQ\")  # handle to the WikiQ collection\n",
123 | "WikiQ.aggregate.over_all()"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## Clean up"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# client.collections.delete(\"WikiQ\")"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "## Close the client"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "client.close()"
156 | ]
157 | }
158 | ],
159 | "metadata": {
160 | "kernelspec": {
161 | "display_name": ".venv (3.11.9)",
162 | "language": "python",
163 | "name": "python3"
164 | },
165 | "language_info": {
166 | "codemirror_mode": {
167 | "name": "ipython",
168 | "version": 3
169 | },
170 | "file_extension": ".py",
171 | "mimetype": "text/x-python",
172 | "name": "python",
173 | "nbconvert_exporter": "python",
174 | "pygments_lexer": "ipython3",
175 | "version": "3.11.9"
176 | }
177 | },
178 | "nbformat": 4,
179 | "nbformat_minor": 2
180 | }
181 |
--------------------------------------------------------------------------------
/5-vector-compression/1-sq-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Compression – Load Data and compress vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
46 | "\n",
47 | "# Connect to Weaviate Cloud using the URL and API key loaded from .env\n",
48 | "client = weaviate.connect_to_weaviate_cloud(\n",
49 | "    cluster_url=WEAVIATE_URL,\n",
50 | "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
51 | "\n",
52 | "    # additional_config=AdditionalConfig(timeout=Timeout(init=2, query=45, insert=120)),  # Values in seconds\n",
53 | ")\n",
54 | "\n",
55 | "client.is_ready()"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Create Collection with SQ configuration\n",
63 | "\n",
64 | "[Docs: Scalar Quantization (SQ)](https://weaviate.io/developers/weaviate/configuration/compression/sq-compression)\n",
65 | "\n",
66 | "> Note: Scalar Quantization includes a training phase, which is required to determine scalar bucket boundaries.\n",
67 | "> In other words, based on your data, it figures out how to best compress your vectors.\n",
68 | ">\n",
69 | "> The compression training starts when the collection reaches `training_limit` number of objects.\n",
70 | "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "from weaviate.classes.config import Configure\n",
80 | "\n",
81 | "client.collections.delete(\"WikiQ\")\n",
82 | "\n",
83 | "# Create a collection here - with Weaviate Embeddings (text2vec_weaviate) as the vectorizer\n",
84 | "client.collections.create(\n",
85 | " name=\"WikiQ\",\n",
86 | "\n",
87 | " vector_config=[\n",
88 | " Configure.Vectors.text2vec_weaviate(\n",
89 | " name=\"main_vector\",\n",
90 | "\n",
91 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
92 | " source_properties=['title', 'text'],\n",
93 | "\n",
94 | " # Configure SQ\n",
95 | " vector_index_config=Configure.VectorIndex.hnsw(\n",
96 | " quantizer=Configure.VectorIndex.Quantizer.sq(\n",
97 | " rescore_limit=200, # the number of overfetched candidates used for rescoring\n",
98 | " training_limit=10_000 # (default 100k) number of objects needed to train the codebook\n",
99 | " )\n",
100 | " ),\n",
101 | " )\n",
102 | " ],\n",
103 | ")"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "## The rest is the same"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from data_loader import import_wiki_data\n",
120 | "import_wiki_data(client, \"WikiQ\", 25000)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "WikiQ = client.collections.use(\"WikiQ\")\n",
130 | "WikiQ.aggregate.over_all()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "## Clean up"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "# client.collections.delete(\"WikiQ\")"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Close the client"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "client.close()"
163 | ]
164 | }
165 | ],
166 | "metadata": {
167 | "kernelspec": {
168 | "display_name": ".venv (3.11.9)",
169 | "language": "python",
170 | "name": "python3"
171 | },
172 | "language_info": {
173 | "codemirror_mode": {
174 | "name": "ipython",
175 | "version": 3
176 | },
177 | "file_extension": ".py",
178 | "mimetype": "text/x-python",
179 | "name": "python",
180 | "nbconvert_exporter": "python",
181 | "pygments_lexer": "ipython3",
182 | "version": "3.11.9"
183 | }
184 | },
185 | "nbformat": 4,
186 | "nbformat_minor": 2
187 | }
188 |
--------------------------------------------------------------------------------
/5-vector-compression/1-bq-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# BQ Compression – Load Data and compress vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | ")\n",
50 | "\n",
51 | "client.is_ready()"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Create Collection with BQ configuration\n",
59 | "\n",
60 | "[Docs: Binary Quantization (BQ)](https://weaviate.io/developers/weaviate/configuration/compression/bq-compression)\n",
61 | "\n",
62 | "Note #1: Binary Quantization works from the first object added to the collection. No training required.\n",
63 | "\n",
64 | "Note #2: Binary Quantization works both with HNSW and Flat index."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from weaviate.classes.config import Configure, VectorDistances\n",
74 | "\n",
75 | "client.collections.delete(\"WikiQ\")\n",
76 | "\n",
77 | "# Create a collection here - with Weaviate Embeddings (text2vec_weaviate) as the vectorizer\n",
78 | "client.collections.create(\n",
79 | " name=\"WikiQ\",\n",
80 | "\n",
81 | " vector_config=[\n",
82 | " Configure.Vectors.text2vec_weaviate(\n",
83 | " name=\"main_vector\",\n",
84 | "\n",
85 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
86 | " source_properties=['title', 'text'],\n",
87 | "\n",
88 | " # Configure BQ with flat vector index\n",
89 | " vector_index_config=Configure.VectorIndex.flat(\n",
90 | " distance_metric=VectorDistances.COSINE,\n",
91 | " vector_cache_max_objects=100_000,\n",
92 | " quantizer=Configure.VectorIndex.Quantizer.bq(\n",
93 | " rescore_limit=200,\n",
94 | " cache=True\n",
95 | " )\n",
96 | " ),\n",
97 | "\n",
98 | " # HNSW example\n",
99 | " # vector_index_config=Configure.VectorIndex.hnsw(\n",
100 | " # quantizer=Configure.VectorIndex.Quantizer.bq(\n",
101 | " # rescore_limit=200,\n",
102 | " # cache=True\n",
103 | " # )\n",
104 | " # ),\n",
105 | " )\n",
106 | " ],\n",
107 | ")"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## The rest is the same"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "from data_loader import import_wiki_data\n",
124 | "import_wiki_data(client, \"WikiQ\", 25000)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "wikiQ = client.collections.use(\"WikiQ\")\n",
134 | "wikiQ.aggregate.over_all()"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "## Clean up"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "# client.collections.delete(\"WikiQ\")"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Close the client"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "client.close()"
167 | ]
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": ".venv (3.11.9)",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.11.9"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 2
191 | }
192 |
--------------------------------------------------------------------------------
/5-vector-compression/1-pq-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Compression – Load Data and compress vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
46 | "\n",
47 | "# Connect to Weaviate Cloud using the URL and API key loaded from .env\n",
48 | "client = weaviate.connect_to_weaviate_cloud(\n",
49 | "    cluster_url=WEAVIATE_URL,\n",
50 | "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
51 | "\n",
52 | "    # additional_config=AdditionalConfig(timeout=Timeout(init=2, query=45, insert=120)),  # Values in seconds\n",
53 | ")\n",
54 | "\n",
55 | "client.is_ready()"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Create Collection with PQ configuration\n",
63 | "\n",
64 | "[Docs: Product Quantization (PQ)](https://weaviate.io/developers/weaviate/configuration/compression/pq-compression)\n",
65 | "\n",
66 | "> Note: Product Quantization includes a training phase, which is required to create codebooks (codebooks are used to generate centroids for compressed vectors).\n",
67 | "> In other words, based on your data, it figures out how to best compress your vectors.\n",
68 | ">\n",
69 | "> The compression training starts when the collection reaches `training_limit` number of objects.\n",
70 | "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "from weaviate.classes.config import Configure\n",
80 | "\n",
81 | "client.collections.delete(\"WikiQ\")\n",
82 | "\n",
83 | "# Create a collection here - with Weaviate as a vectorizer\n",
84 | "client.collections.create(\n",
85 | " name=\"WikiQ\",\n",
86 | "\n",
87 | " vector_config=[\n",
88 | " Configure.Vectors.text2vec_weaviate(\n",
89 | " name=\"main_vector\",\n",
90 | "\n",
91 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
92 | " source_properties=['title', 'text'],\n",
93 | "\n",
94 | " # Configure PQ\n",
95 | " vector_index_config=Configure.VectorIndex.hnsw(\n",
96 | " quantizer=Configure.VectorIndex.Quantizer.pq(\n",
97 | " segments=256, # number of segments; must evenly divide the vector dimensions (1024 / 4 for this model)\n",
98 | " training_limit=10_000 # (default 100k) number of objects needed to train the codebook\n",
99 | " )\n",
100 | " ),\n",
101 | " )\n",
102 | " ],\n",
103 | ")"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "## The rest is the same"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from data_loader import import_wiki_data\n",
120 | "import_wiki_data(client, \"WikiQ\", 25000)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "WikiQ = client.collections.use(\"WikiQ\")\n",
130 | "WikiQ.aggregate.over_all()"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "## Clean up"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "# client.collections.delete(\"WikiQ\")"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Close the client"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "client.close()"
163 | ]
164 | }
165 | ],
166 | "metadata": {
167 | "kernelspec": {
168 | "display_name": ".venv (3.11.9)",
169 | "language": "python",
170 | "name": "python3"
171 | },
172 | "language_info": {
173 | "codemirror_mode": {
174 | "name": "ipython",
175 | "version": 3
176 | },
177 | "file_extension": ".py",
178 | "mimetype": "text/x-python",
179 | "name": "python",
180 | "nbconvert_exporter": "python",
181 | "pygments_lexer": "ipython3",
182 | "version": "3.11.9"
183 | }
184 | },
185 | "nbformat": 4,
186 | "nbformat_minor": 2
187 | }
188 |
--------------------------------------------------------------------------------
/4-multi-tenancy/4-search-tenants.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Multi-tenant Chat with Papers - Query papers\n",
8 | "## Get keys and urls"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import os\n",
18 | "from dotenv import load_dotenv\n",
19 | "\n",
20 | "load_dotenv()\n",
21 | "\n",
22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
23 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
24 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])\n",
28 | "print(OPENAI_API_KEY[:10])"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Connect to Weaviate"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import weaviate\n",
45 | "from weaviate.classes.init import Auth\n",
46 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
47 | "\n",
48 | "client = weaviate.connect_to_weaviate_cloud(\n",
49 | " cluster_url=WEAVIATE_URL,\n",
50 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
51 | "\n",
52 | " headers = {\n",
53 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
54 | " },\n",
55 | "\n",
56 | " # additional_config=AdditionalConfig(\n",
57 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
58 | " # )\n",
59 | ")\n",
60 | "\n",
61 | "client.is_ready()"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "## Vector search on tenants"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "papers = client.collections.use(\"Papers\")\n",
78 | "\n",
79 | "ten = papers.with_tenant(\"2212-10496\")\n",
80 | "\n",
81 | "response = ten.query.near_text(\n",
82 | " query=\"Unsupervised learning\",\n",
83 | " limit=5,\n",
84 | ")\n",
85 | "\n",
86 | "for item in response.objects:\n",
87 | " print(item.properties[\"chunk\"])"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "## Generative Search with tenants"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "papers = client.collections.use(\"Papers\")\n",
104 | "\n",
105 | "ten2212 = papers.with_tenant(\"2212-10496\")\n",
106 | "\n",
107 | "response = ten2212.generate.near_text(\n",
108 | " query=\"Unsupervised learning\",\n",
109 | " limit=5,\n",
110 | " single_prompt=\"What does the following text describe: {chunk}\",\n",
111 | ")\n",
112 | "\n",
113 | "for item in response.objects:\n",
114 | " print(item.properties[\"chunk\"])\n",
115 | " print(item.generative.text, '\\n')"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "papers = client.collections.use(\"Papers\")\n",
125 | "\n",
126 | "ten2212 = papers.with_tenant(\"2212-10496\")\n",
127 | "\n",
128 | "response = ten2212.generate.near_text(\n",
129 | " query=\"Unsupervised learning\",\n",
130 | " limit=5,\n",
131 | " grouped_task=\"Explain how unsupervised learning works. Use only the provided content.\",\n",
132 | " grouped_properties=[\"chunk\"]\n",
133 | ")\n",
134 | "\n",
135 | "for item in response.objects:\n",
136 | " print(item.properties[\"chunk\"])\n",
137 | "\n",
138 | "print(response.generative.text)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "def paper_rag(paper_id, query, prompt):\n",
148 | " papers = client.collections.use(\"Papers\")\n",
149 | " ten = papers.with_tenant(paper_id)\n",
150 | "\n",
151 | " response = ten.generate.near_text(\n",
152 | " query=query,\n",
153 | " limit=5,\n",
154 | " grouped_task=prompt + \" Use only the provided content.\",\n",
155 | " grouped_properties=[\"chunk\"],\n",
156 | " )\n",
157 | "\n",
158 | " return {\n",
159 | " \"title\": response.objects[0].properties[\"title\"],\n",
160 | " \"source\": [p.properties[\"chunk\"] for p in response.objects],\n",
161 | " \"generated\": response.generative.text\n",
162 | " }"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "paper_rag(\n",
172 | " \"2212-10496\",\n",
173 | " \"Unsupervised learning\",\n",
174 | " \"Explain how unsupervised learning works\"\n",
175 | ")"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "papers = client.collections.use(\"Papers\")\n",
185 | "papers.tenants.get()"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## Close the client"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "client.close()"
202 | ]
203 | }
204 | ],
205 | "metadata": {
206 | "kernelspec": {
207 | "display_name": ".venv (3.11.9)",
208 | "language": "python",
209 | "name": "python3"
210 | },
211 | "language_info": {
212 | "codemirror_mode": {
213 | "name": "ipython",
214 | "version": 3
215 | },
216 | "file_extension": ".py",
217 | "mimetype": "text/x-python",
218 | "name": "python",
219 | "nbconvert_exporter": "python",
220 | "pygments_lexer": "ipython3",
221 | "version": "3.11.9"
222 | }
223 | },
224 | "nbformat": 4,
225 | "nbformat_minor": 2
226 | }
227 |
--------------------------------------------------------------------------------
/3-rag/2-rag-gen-query-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# !pip install openai"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
26 | "\n",
27 | "print(WEAVIATE_URL[:10])\n",
28 | "print(WEAVIATE_KEY[:10])\n",
29 | "print(OPENAI_API_KEY[:10])"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Generate query from prompt"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "from openai import OpenAI\n",
46 | "\n",
47 | "openai_client = OpenAI(\n",
48 | " api_key=OPENAI_API_KEY,\n",
49 | " base_url=\"https://api.openai.com/v1\",\n",
50 | ")"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "def generate_query_from_promt(prompt):\n",
60 | " response = openai_client.chat.completions.create(\n",
61 | " model=\"gpt-3.5-turbo\",\n",
62 | " messages=[\n",
63 | " { \"role\": \"system\", \"content\": \"Your job is to extract a query from the provided user prompt, the query will then be used to run a query in a vector database.\" },\n",
64 | " { \n",
65 | " \"role\": \"user\",\n",
66 | " \"content\": f\"Please give me a 2-3 word query that can be used to find relevant info to the following prompt - {prompt}\"\n",
67 | " },\n",
68 | " ]\n",
69 | " )\n",
70 | " return response.choices[0].message.content"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "# Example of how to generate a query from a prompt\n",
80 | "generate_query_from_promt(\"Where do the tallest penguins live?\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Connect to Weaviate"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "import weaviate\n",
97 | "from weaviate.classes.init import Auth\n",
98 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
99 | "\n",
100 | "client = weaviate.connect_to_weaviate_cloud(\n",
101 | " cluster_url=WEAVIATE_URL,\n",
102 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
103 | "\n",
104 | " headers = {\n",
105 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
106 | " },\n",
107 | "\n",
108 | " # additional_config=AdditionalConfig(\n",
109 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
110 | " # )\n",
111 | ")\n",
112 | "\n",
113 | "client.is_ready()"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "## Two-step RAG"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "def two_step_rag(user_prompt):\n",
130 | " # Step 1\n",
131 | " prompt = user_prompt + \" Please only use the provided content with this prompt. Don't make things up.\"\n",
132 | " \n",
133 | " generated_query = generate_query_from_promt(prompt)\n",
134 | " print(\"=== Generated Query ===\")\n",
135 | " print(f\"Generated query: {generated_query}\")\n",
136 | "\n",
137 | " # Step 2\n",
138 | " wiki = client.collections.use(\"Wiki\")\n",
139 | "\n",
140 | " response = wiki.generate.near_text(\n",
141 | " query=generated_query,\n",
142 | " limit=3,\n",
143 | " grouped_task=prompt,\n",
144 | " grouped_properties=[\"text\", \"title\"]\n",
145 | " )\n",
146 | "\n",
147 | " # Print results\n",
148 | " print(\"\\n=== Generated Response ===\")\n",
149 | " print(response.generative.text)\n",
150 | "\n",
151 | " print(\"\\n=== Source ===\")\n",
152 | " for item in response.objects:\n",
153 | " print(item.properties)"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "# two_step_rag(\"What wild animals do we know about?\")\n",
163 | "two_step_rag(\"Please provide an explanation at a highschool level. How do airplanes fly?\")"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "two_step_rag(\"What are the pros and cons of automation using computer?\")"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "two_step_rag(\"How do CPUs work?\")"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "## Close the client"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "client.close()"
198 | ]
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": ".venv (3.11.9)",
204 | "language": "python",
205 | "name": "python3"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 3
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython3",
217 | "version": "3.11.9"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 2
222 | }
223 |
--------------------------------------------------------------------------------
/4-multi-tenancy/3b-load-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Multi-tenant Chat with Papers - Load and chunk papers\n",
8 | "## Get keys and urls"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import os\n",
18 | "from dotenv import load_dotenv\n",
19 | "\n",
20 | "load_dotenv()\n",
21 | "\n",
22 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
23 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
24 | "\n",
25 | "print(WEAVIATE_URL[:10])\n",
26 | "print(WEAVIATE_KEY[:10])"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Connect to Weaviate"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import weaviate\n",
43 | "from weaviate.classes.init import Auth\n",
44 | "\n",
45 | "client = weaviate.connect_to_weaviate_cloud(\n",
46 | " cluster_url=WEAVIATE_URL,\n",
47 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
48 | ")\n",
49 | "\n",
50 | "client.is_ready()"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Load Data from arxiv\n",
58 | "\n",
59 | "1. Get chunks from paper - `get_chunks_from_paper`\n",
60 | "2. Create a tenant for the paper - `create_tenant`\n",
61 | "3. Batch import chunks - `batch_import_chunks`"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### 1. Get chunks from paper - `get_chunks_from_paper`"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "from distyll.text import from_arxiv_paper\n",
78 | "from distyll.utils import chunk_text\n",
79 | "\n",
80 | "def get_chunks_from_paper(url):\n",
81 | " paper = from_arxiv_paper(url)\n",
82 | " chunks = chunk_text(source_text=paper[\"text\"])\n",
83 | "\n",
84 | " paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n",
85 | " paper[\"chunks\"] = chunks\n",
86 | " return paper"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "chunked_2212 = get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n",
96 | "chunked_2212"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "### 2. Create a tenant for the paper - `create_tenant`"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "from weaviate.classes.tenants import Tenant\n",
113 | "papers = client.collections.use(\"Papers\")\n",
114 | "\n",
115 | "def create_tenant(chunked_paper):\n",
116 | " tenant_name = chunked_paper[\"arxiv_id\"]\n",
117 | "\n",
118 | " papers.tenants.create([\n",
119 | " Tenant(name=tenant_name)\n",
120 | " ])\n",
121 | "\n",
122 | " return tenant_name"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "create_tenant(chunked_2212)"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "papers.tenants.get()"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "### 3. Batch import chunks - `batch_import_chunks`"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "def batch_import_chunks(chunked_paper):\n",
157 | "    \"\"\"Batch-import each chunk of the paper into the paper's own tenant and report failures.\"\"\"\n",
158 | "    ten = papers.with_tenant(chunked_paper[\"arxiv_id\"])\n",
159 | "\n",
160 | "    with ten.batch.dynamic() as batch:\n",
161 | "        for i, chunk in enumerate(chunked_paper[\"chunks\"]):\n",
162 | "            batch.add_object({\n",
163 | "                \"title\": chunked_paper[\"title\"],\n",
164 | "                \"url\": chunked_paper[\"url\"],\n",
165 | "                \"chunk\": chunk,\n",
166 | "                \"chunk_no\": i,\n",
167 | "            })\n",
168 | "\n",
169 | "    # Check and list failures on the same tenant handle that ran the batch\n",
170 | "    failed = ten.batch.failed_objects\n",
171 | "    if len(failed) > 0:\n",
172 | "        print(\"Import complete with errors\")\n",
173 | "        for err in failed:\n",
174 | "            print(err)\n",
175 | "    else:\n",
176 | "        print(\"Import complete with no errors\")"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "batch_import_chunks(chunked_2212)"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## End-to-end paper load"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "def import_paper(url):\n",
202 | " cp = get_chunks_from_paper(url)\n",
203 | " tenant_name = create_tenant(cp)\n",
204 | " batch_import_chunks(cp)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "import_paper(\"https://arxiv.org/pdf/2401.00107.pdf\")"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Close the client"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "client.close()"
230 | ]
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": ".venv (3.11.9)",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.11.9"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 2
254 | }
255 |
--------------------------------------------------------------------------------
/5-vector-compression/0-vector-indexes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Vector Indexes\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | ")\n",
50 | "\n",
51 | "client.is_ready()"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Collection with HNSW index (default)\n",
59 | "\n",
60 | "[HNSW params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#hnsw-index-parameters)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from weaviate.classes.config import Configure, VectorDistances\n",
70 | "\n",
71 | "client.collections.delete(\"IndexExample\")\n",
72 | "\n",
73 | "# Create a collection here - with Weaviate as a vectorizer\n",
74 | "client.collections.create(\n",
75 | " name=\"IndexExample\",\n",
76 | "\n",
77 | " vector_config=[\n",
78 | " Configure.Vectors.text2vec_weaviate(\n",
79 | " name=\"main_vector\",\n",
80 | "\n",
81 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
82 | " source_properties=['title', 'text'],\n",
83 | "\n",
84 | " # HNSW example \n",
85 | " vector_index_config=Configure.VectorIndex.hnsw()\n",
86 | " )\n",
87 | " ],\n",
88 | ")"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "## Collection with Flat index\n",
96 | "\n",
97 | "[Flat params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#flat-indexes)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from weaviate.classes.config import Configure, VectorDistances\n",
107 | "\n",
108 | "client.collections.delete(\"IndexExample\")\n",
109 | "\n",
110 | "# Create a collection here - with Weaviate as a vectorizer\n",
111 | "client.collections.create(\n",
112 | " name=\"IndexExample\",\n",
113 | "\n",
114 | " vector_config=[\n",
115 | " Configure.Vectors.text2vec_weaviate(\n",
116 | " name=\"main_vector\",\n",
117 | "\n",
118 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
119 | " source_properties=['title', 'text'],\n",
120 | "\n",
121 | " # Flat example\n",
122 | " vector_index_config=Configure.VectorIndex.flat(\n",
123 | " # distance_metric=VectorDistances.COSINE, # optional\n",
124 | " vector_cache_max_objects=100_000,\n",
125 | " ),\n",
126 | " ),\n",
127 | " ],\n",
128 | ")"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## Collection with Dynamic index\n",
136 | "\n",
137 | "[Dynamic params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#dynamic-index-parameters)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "from weaviate.classes.config import Configure\n",
147 | "\n",
148 | "client.collections.delete(\"IndexExample\")\n",
149 | "\n",
150 | "# Create a collection here - with Weaviate as a vectorizer\n",
151 | "client.collections.create(\n",
152 | " name=\"IndexExample\",\n",
153 | "\n",
154 | " vector_config=[\n",
155 | " Configure.Vectors.text2vec_weaviate(\n",
156 | " name=\"main_vector\",\n",
157 | "\n",
158 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
159 | " source_properties=['title', 'text'],\n",
160 | "\n",
161 | " # Dynamic example\n",
162 | " vector_index_config=Configure.VectorIndex.dynamic(\n",
163 | " threshold=10_000, # when to switch to HNSW\n",
164 | "\n",
165 | " flat=Configure.VectorIndex.flat(\n",
166 | " vector_cache_max_objects=100_000,\n",
167 | " # note: can also include a quantizer\n",
168 | " quantizer=Configure.VectorIndex.Quantizer.bq()\n",
169 | " ),\n",
170 | "\n",
171 | " hnsw=Configure.VectorIndex.hnsw(\n",
172 | " max_connections=32, # optional\n",
173 | " # note: the quantizer can be different between flat and hnsw\n",
174 | " quantizer=Configure.VectorIndex.Quantizer.pq()\n",
175 | " ),\n",
176 | " ),\n",
177 | " )\n",
178 | " ],\n",
179 | ")"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## The rest is the same"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "# uncomment this if you want to import 25k objects to your collection to test it\n",
196 | "# from data_loader import import_wiki_data\n",
197 | "# import_wiki_data(client, \"IndexExample\", 25_000)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "# index_example = client.collections.use(\"IndexExample\")\n",
207 | "# index_example.aggregate.over_all()"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "## Clean up"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "client.collections.delete(\"IndexExample\")"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "## Close the client"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "client.close()"
240 | ]
241 | }
242 | ],
243 | "metadata": {
244 | "kernelspec": {
245 | "display_name": ".venv (3.11.9)",
246 | "language": "python",
247 | "name": "python3"
248 | },
249 | "language_info": {
250 | "codemirror_mode": {
251 | "name": "ipython",
252 | "version": 3
253 | },
254 | "file_extension": ".py",
255 | "mimetype": "text/x-python",
256 | "name": "python",
257 | "nbconvert_exporter": "python",
258 | "pygments_lexer": "ipython3",
259 | "version": "3.11.9"
260 | }
261 | },
262 | "nbformat": 4,
263 | "nbformat_minor": 2
264 | }
265 |
--------------------------------------------------------------------------------
/2-pre-vectorised-data/2-wiki-import.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load Data with Vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | ")\n",
50 | "\n",
51 | "client.is_ready()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "weaviate.__version__"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from weaviate.classes.config import Configure\n",
70 | "\n",
71 | "def create_wiki_collection():\n",
72 | " if client.collections.exists(\"Wiki\"):\n",
73 | " client.collections.delete(\"Wiki\")\n",
74 | "\n",
75 | " # Create a collection here - with Weaviate vectorizer and define source properties\n",
76 | " client.collections.create(\n",
77 | " name=\"Wiki\",\n",
78 | "\n",
79 | " vector_config=[\n",
80 | " Configure.Vectors.text2vec_weaviate(\n",
81 | " name=\"main_vector\",\n",
82 | "\n",
83 | " # TODO: use model Snowflake/snowflake-arctic-embed-l-v2.0\n",
84 | " # TODO: set source properties to title and text\n",
85 | " \n",
86 | " )\n",
87 | " ],\n",
88 | " )\n",
89 | "\n",
90 | "create_wiki_collection()"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Load the data from parquet files"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from datasets import load_dataset\n",
107 | "\n",
108 | "def prepare_dataset():\n",
109 | " return load_dataset('parquet', data_files={'train': ['../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n",
110 | " # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### Dataset Test\n",
118 | ""
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "dataset = prepare_dataset()\n",
128 | "\n",
129 | "counter = 10\n",
130 | "for i in dataset:\n",
131 | " print(i)\n",
132 | "\n",
133 | " counter -= 1\n",
134 | " if(counter == 0): break"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "### The import function\n",
142 | "\n",
143 | "`TODO:`\n",
144 | "* add a function to add objects to batch"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "from tqdm import tqdm\n",
154 | "from weaviate.util import generate_uuid5\n",
155 | "\n",
156 | "def import_wiki_data(max_rows=10_000):\n",
157 | " print(f\"Importing {max_rows} data items\")\n",
158 | "\n",
159 | " dataset = prepare_dataset()\n",
160 | " wiki = client.collections.use(\"Wiki\")\n",
161 | "\n",
162 | " counter = 0\n",
163 | "\n",
164 | " with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
165 | " for item in tqdm(dataset, total=max_rows):\n",
166 | "\n",
167 | " data_to_insert = { \n",
168 | " \"wiki_id\": item[\"wiki_id\"],\n",
169 | " \"text\": item[\"text\"],\n",
170 | " \"title\": item[\"title\"],\n",
171 | " \"url\": item[\"url\"],\n",
172 | " }\n",
173 | "\n",
174 | " item_id = generate_uuid5(item[\"wiki_id\"])\n",
175 | "\n",
176 | " item_vector = {\n",
177 | " \"main_vector\": item[\"vector\"]\n",
178 | " }\n",
179 | "\n",
180 | " # TODO: add objects to batch using\n",
181 | " batch.add_object(\n",
182 | " # * data_to_insert\n",
183 | " # * item_id\n",
184 | " # * item_vector\n",
185 | " )\n",
186 | "\n",
187 | " # Check number of errors while running\n",
188 | " if(batch.number_errors > 10):\n",
189 | " print(f\"Reached {batch.number_errors} errors during batch import\")\n",
190 | " break\n",
191 | " \n",
192 | " # stop once the number of processed items reaches max_rows\n",
193 | " counter += 1\n",
194 | " if counter >= max_rows:\n",
195 | " break\n",
196 | " \n",
197 | " # check for errors at the end\n",
198 | " if (len(wiki.batch.failed_objects)>0):\n",
199 | " print(\"Final error check\")\n",
200 | " print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n",
201 | " print(wiki.batch.failed_objects[-1])\n",
202 | " \n",
203 | " print(f\"Imported {counter} items\")\n",
204 | " print(\"-----------------------------------\")"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "import_wiki_data(10_000)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Check if data loaded correctly"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "wiki = client.collections.use(\"Wiki\")\n",
230 | "len(wiki)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n",
240 | "print(res.objects[0].properties)\n",
241 | "print(res.objects[0].vector)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "client.close()"
251 | ]
252 | }
253 | ],
254 | "metadata": {
255 | "kernelspec": {
256 | "display_name": ".venv (3.11.9)",
257 | "language": "python",
258 | "name": "python3"
259 | },
260 | "language_info": {
261 | "codemirror_mode": {
262 | "name": "ipython",
263 | "version": 3
264 | },
265 | "file_extension": ".py",
266 | "mimetype": "text/x-python",
267 | "name": "python",
268 | "nbconvert_exporter": "python",
269 | "pygments_lexer": "ipython3",
270 | "version": "3.11.9"
271 | }
272 | },
273 | "nbformat": 4,
274 | "nbformat_minor": 2
275 | }
276 |
--------------------------------------------------------------------------------
/2-pre-vectorised-data/complete/2-wiki-import-complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load Data with Vectors\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "\n",
46 | "client = weaviate.connect_to_weaviate_cloud(\n",
47 | " cluster_url=WEAVIATE_URL,\n",
48 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
49 | ")\n",
50 | "\n",
51 | "client.is_ready()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "from weaviate.classes.config import Configure\n",
61 | "\n",
62 | "def create_wiki_collection():\n",
63 | " if client.collections.exists(\"Wiki\"):\n",
64 | " client.collections.delete(\"Wiki\")\n",
65 | "\n",
66 | " # Create a collection here - with Weaviate vectorizer and define source properties\n",
67 | " client.collections.create(\n",
68 | " name=\"Wiki\",\n",
69 | "\n",
70 | " vector_config=[\n",
71 | " Configure.Vectors.text2vec_weaviate(\n",
72 | " name=\"main_vector\",\n",
73 | "\n",
74 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
75 | " source_properties=['title', 'text'] # which properties should be used to generate a vector\n",
76 | " )\n",
77 | " ],\n",
78 | " )\n",
79 | "\n",
80 | "create_wiki_collection()"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Load the data from parquet files"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "from datasets import load_dataset\n",
97 | "\n",
98 | "def prepare_dataset():\n",
99 | " return load_dataset('parquet', data_files={'train': ['../../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n",
100 | " # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### Dataset Test\n",
108 | ""
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "dataset = prepare_dataset()\n",
118 | "\n",
119 | "counter = 10\n",
120 | "for i in dataset:\n",
121 | " print(i)\n",
122 | "\n",
123 | " counter -= 1\n",
124 | " if(counter == 0): break"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "### The import function\n",
132 | "\n",
133 | "`TODO:`\n",
134 | "* add a function to add objects to batch"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "from tqdm import tqdm\n",
144 | "from weaviate.util import generate_uuid5\n",
145 | "\n",
146 | "def import_wiki_data(max_rows=10_000):\n",
147 | " print(f\"Importing {max_rows} data items\")\n",
148 | "\n",
149 | " dataset = prepare_dataset()\n",
150 | " wiki = client.collections.use(\"Wiki\")\n",
151 | "\n",
152 | " counter = 0\n",
153 | "\n",
154 | " with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
155 | " for item in tqdm(dataset, total=max_rows):\n",
156 | "\n",
157 | " data_to_insert = { \n",
158 | " \"wiki_id\": item[\"wiki_id\"],\n",
159 | " \"text\": item[\"text\"],\n",
160 | " \"title\": item[\"title\"],\n",
161 | " \"url\": item[\"url\"],\n",
162 | " }\n",
163 | "\n",
164 | " item_id = generate_uuid5(item[\"wiki_id\"])\n",
165 | "\n",
166 | " # vector = item[\"vector\"]\n",
167 | " item_vector = {\n",
168 | " \"main_vector\": item[\"vector\"]\n",
169 | " }\n",
170 | "\n",
171 | " batch.add_object(\n",
172 | " properties=data_to_insert,\n",
173 | " \n",
174 | " uuid=item_id,\n",
175 | " vector=item_vector\n",
176 | " )\n",
177 | "\n",
178 | " # Check number of errors while running\n",
179 | " if(batch.number_errors > 10):\n",
180 | " print(f\"Reached {batch.number_errors} Errors during batch import\")\n",
181 | " break\n",
182 | " \n",
183 | " # stop after the request number reaches = max_rows\n",
184 | " counter += 1\n",
185 | " if(counter >= max_rows):\n",
186 | " break\n",
187 | " \n",
188 | " # check for errors at the end\n",
189 | " if (len(wiki.batch.failed_objects)>0):\n",
190 | " print(\"Final error check\")\n",
191 | " print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n",
192 | " print(wiki.batch.failed_objects[-1])\n",
193 | " \n",
194 | " print(f\"Imported {counter} items\")\n",
195 | " print(\"-----------------------------------\")"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "import_wiki_data(10_000)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "## Check if data loaded correctly"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "wiki = client.collections.use(\"Wiki\")\n",
221 | "len(wiki)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n",
231 | "print(res.objects[0].properties)\n",
232 | "print(res.objects[0].vector)"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "## Close the client"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "client.close()"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": []
257 | }
258 | ],
259 | "metadata": {
260 | "kernelspec": {
261 | "display_name": ".venv (3.11.9)",
262 | "language": "python",
263 | "name": "python3"
264 | },
265 | "language_info": {
266 | "codemirror_mode": {
267 | "name": "ipython",
268 | "version": 3
269 | },
270 | "file_extension": ".py",
271 | "mimetype": "text/x-python",
272 | "name": "python",
273 | "nbconvert_exporter": "python",
274 | "pygments_lexer": "ipython3",
275 | "version": "3.11.9"
276 | }
277 | },
278 | "nbformat": 4,
279 | "nbformat_minor": 2
280 | }
281 |
--------------------------------------------------------------------------------
/3-rag/complete/1-rag-complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# RAG - How to query"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "from dotenv import load_dotenv\n",
18 | "\n",
19 | "load_dotenv()\n",
20 | "\n",
21 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
22 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
23 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
24 | "\n",
25 | "print(WEAVIATE_URL[:10])\n",
26 | "print(WEAVIATE_KEY[:10])\n",
27 | "print(OPENAI_API_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
46 | "\n",
47 | "client = weaviate.connect_to_weaviate_cloud(\n",
48 | " cluster_url=WEAVIATE_URL,\n",
49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
50 | "\n",
51 | " headers = {\n",
52 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
53 | " },\n",
54 | "\n",
55 | " # additional_config=AdditionalConfig(\n",
56 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
57 | " # )\n",
58 | ")\n",
59 | "\n",
60 | "client.is_ready()"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Start with (R) - Retrieval"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "wiki = client.collections.use(\"Wiki\")\n",
77 | "\n",
78 | "response = wiki.query.near_text(\n",
79 | " query=\"How do planes fly\",\n",
80 | " limit=5,\n",
81 | " return_properties=[\"text\", \"title\"]\n",
82 | ")\n",
83 | "\n",
84 | "for item in response.objects:\n",
85 | " print(item.properties)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Add (AG) - augmented generation - to make full RAG"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "#### Single Prompt\n",
100 | "\n",
101 | "> Generate a response per **retrieved** object."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# Let's add some colour to our lives :)\n",
111 | "BLUE = \"\\033[94m\"\n",
112 | "PURPLE = \"\\033[95m\"\n",
113 | "RESET = \"\\033[0m\""
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "from weaviate.classes.generate import GenerativeConfig\n",
123 | "\n",
124 | "wiki = client.collections.use(\"Wiki\")\n",
125 | "\n",
126 | "response = wiki.generate.near_text(\n",
127 | " query=\"How do planes fly\",\n",
128 | " # auto_limit=1,\n",
129 | " limit=5,\n",
130 | "\n",
131 | " # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n",
132 | " generative_provider=GenerativeConfig.openai(\n",
133 | " model=\"gpt-4o-mini\",\n",
134 | " ),\n",
135 | " \n",
136 | " # TODO: add a single prompt \"Explain what this is about? {text}\"\n",
137 | " single_prompt=\"Explain what this is about? {text}\"\n",
138 | ")\n",
139 | "\n",
140 | "for item in response.objects:\n",
141 | " print(f\"{BLUE}=== Source ===\")\n",
142 | " print(item.properties)\n",
143 | "\n",
144 | " print(f\"{PURPLE}=== Generated Response ===\")\n",
145 | " print(item.generative.text)\n",
146 | " print(\"\\n\")"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "#### Grouped Task\n",
154 | "\n",
155 | "> Generate one response based on all **retrieved** objects."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "wiki = client.collections.use(\"Wiki\")\n",
165 | "\n",
166 | "response = wiki.generate.near_text(\n",
167 | " query=\"How do planes fly\",\n",
168 | " # auto_limit=1,\n",
169 | " limit=5,\n",
170 | "\n",
171 | " generative_provider=GenerativeConfig.openai(\n",
172 | " model=\"gpt-4o-mini\",\n",
173 | " ),\n",
174 | "\n",
175 | " grouped_task=\"Explain, how do planes fly? Please only use the provided content.\"\n",
176 | ")\n",
177 | "\n",
178 | "print(f\"{PURPLE}=== Generated Response ===\")\n",
179 | "print(response.generative.text)\n",
180 | "\n",
181 | "print(f\"{BLUE}=== Source ===\")\n",
182 | "for item in response.objects:\n",
183 | " print(item.properties)"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "#### Specify which properties to use for grouped task"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "wiki = client.collections.use(\"Wiki\")\n",
200 | "\n",
201 | "response = wiki.generate.near_text(\n",
202 | " query=\"How do planes fly\",\n",
203 | " auto_limit=1,\n",
204 | " grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
205 | " grouped_properties=[\"text\", \"title\"],\n",
206 | "\n",
207 | " generative_provider=GenerativeConfig.openai(\n",
208 | " model=\"gpt-4o-mini\",\n",
209 | " ),\n",
210 | ")\n",
211 | "\n",
212 | "print(f\"{PURPLE}=== Generated Response ===\")\n",
213 | "print(response.generative.text)\n",
214 | "\n",
215 | "print(f\"{BLUE}=== Source ===\")\n",
216 | "for item in response.objects:\n",
217 | " print(item.properties)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "## Set default Generative model"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "from weaviate.classes.config import Reconfigure\n",
234 | "\n",
235 | "wiki = client.collections.use(\"Wiki\")\n",
236 | "\n",
237 | "wiki.config.update(\n",
238 | " generative_config=Reconfigure.Generative.openai(\n",
239 | " model=\"gpt-4o-mini\" # Update the generative model\n",
240 | " )\n",
241 | ")"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "> Try generative query without providing the model"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "response = wiki.generate.near_text(\n",
258 | " query=\"African animals\",\n",
259 | " auto_limit=1,\n",
260 | " \n",
261 | " grouped_task=\"What African animals do we have info on? Please only list those provided in here.\",\n",
262 | ")\n",
263 | "\n",
264 | "print(f\"{PURPLE}=== Generated Response ===\")\n",
265 | "print(response.generative.text)\n",
266 | "\n",
267 | "print(f\"{BLUE}=== Source ===\")\n",
268 | "for item in response.objects:\n",
269 | " print(item.properties)"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "## Close the client"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "client.close()"
286 | ]
287 | }
288 | ],
289 | "metadata": {
290 | "kernelspec": {
291 | "display_name": ".venv (3.11.9)",
292 | "language": "python",
293 | "name": "python3"
294 | },
295 | "language_info": {
296 | "codemirror_mode": {
297 | "name": "ipython",
298 | "version": 3
299 | },
300 | "file_extension": ".py",
301 | "mimetype": "text/x-python",
302 | "name": "python",
303 | "nbconvert_exporter": "python",
304 | "pygments_lexer": "ipython3",
305 | "version": "3.11.9"
306 | }
307 | },
308 | "nbformat": 4,
309 | "nbformat_minor": 2
310 | }
311 |
--------------------------------------------------------------------------------
/3-rag/1-rag.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# RAG - How to query"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "from dotenv import load_dotenv\n",
18 | "\n",
19 | "load_dotenv()\n",
20 | "\n",
21 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
22 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
23 | "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
24 | "\n",
25 | "print(WEAVIATE_URL[:10])\n",
26 | "print(WEAVIATE_KEY[:10])\n",
27 | "print(OPENAI_API_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
46 | "\n",
47 | "client = weaviate.connect_to_weaviate_cloud(\n",
48 | " cluster_url=WEAVIATE_URL,\n",
49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
50 | "\n",
51 | " headers = {\n",
52 | " \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
53 | " },\n",
54 | "\n",
55 | " # additional_config=AdditionalConfig(\n",
56 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
57 | " # )\n",
58 | ")\n",
59 | "\n",
60 | "client.is_ready()"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Start with (R) - Retrieval"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "wiki = client.collections.use(\"Wiki\")\n",
77 | "\n",
78 | "response = wiki.query.near_text(\n",
79 | " query=\"How do planes fly\",\n",
80 | " limit=5,\n",
81 | " return_properties=[\"text\", \"title\"]\n",
82 | ")\n",
83 | "\n",
84 | "for item in response.objects:\n",
85 | " print(item.properties)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Add (AG) - augmented generation - to make full RAG"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "#### Single Prompt\n",
100 | "\n",
101 | "> Generate a response per **retrieved** object."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# Let's add some colour to our lives :)\n",
111 | "BLUE = \"\\033[94m\"\n",
112 | "PURPLE = \"\\033[95m\"\n",
113 | "RESET = \"\\033[0m\""
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "from weaviate.classes.generate import GenerativeConfig\n",
123 | "\n",
124 | "wiki = client.collections.use(\"Wiki\")\n",
125 | "\n",
126 | "response = wiki.generate.near_text(\n",
127 | " query=\"How do planes fly\",\n",
128 | " # auto_limit=1,\n",
129 | " limit=5,\n",
130 | "\n",
131 | " # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n",
132 | " # generative_provider=GenerativeConfig.\n",
133 | "\n",
134 | " # TODO: add a single prompt \"Explain what this is about? {text}\"\n",
135 | " # single_prompt=\n",
136 | ")\n",
137 | "\n",
138 | "# NOTE: the generated responses are included with each object\n",
139 | "\n",
140 | "for item in response.objects:\n",
141 | " print(f\"{BLUE}=== Source ===\")\n",
142 | " print(item.properties)\n",
143 | "\n",
144 | " print(f\"{PURPLE}=== Generated Response ===\")\n",
145 | " # TODO: print the generative.text object\n",
146 | " # print(item.)\n",
147 | "\n",
148 | " print(\"\\n\")"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "#### Grouped Task\n",
156 | "\n",
157 | "> Generate one response based on all **retrieved** objects."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "wiki = client.collections.use(\"Wiki\")\n",
167 | "\n",
168 | "response = wiki.generate.near_text(\n",
169 | " query=\"How do planes fly\",\n",
170 | " # auto_limit=1,\n",
171 | " limit=5,\n",
172 | " \n",
173 | " generative_provider=GenerativeConfig.openai(\n",
174 | " model=\"gpt-4o-mini\",\n",
175 | " ),\n",
176 | "\n",
177 | " # TODO: add a grouped task \"Explain, how do planes fly? Please only use the provided content.\"\n",
178 | " # grouped_task=\n",
179 | ")\n",
180 | "\n",
181 | "print(f\"{PURPLE}=== Generated Response ===\")\n",
182 | "# NOTE: group task response is at response.generative.text\n",
183 | "# TODO: print the generated text\n",
184 | "# print(response.)\n",
185 | "\n",
186 | "print(f\"{BLUE}=== Source ===\")\n",
187 | "for item in response.objects:\n",
188 | " print(item.properties)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "#### Specify which properties to use for grouped task"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "response = wiki.generate.near_text(\n",
205 | " query=\"How do planes fly\",\n",
206 | " auto_limit=1,\n",
207 | "\n",
208 | " generative_provider=GenerativeConfig.openai(\n",
209 | " model=\"gpt-4o-mini\",\n",
210 | " ),\n",
211 | "\n",
212 | " grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
213 | " # TODO: add grouped properties to only use \"text\" and \"title\" \n",
214 | " # grouped_properties=[]\n",
215 | ")\n",
216 | "\n",
217 | "print(\"=== Generated Response ===\")\n",
218 | "print(response.generative.text)\n",
219 | "\n",
220 | "print(\"=== Source ===\")\n",
221 | "for item in response.objects:\n",
222 | " print(item.properties)"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "## Set default Generative model"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "from weaviate.classes.config import Reconfigure\n",
239 | "\n",
240 | "wiki = client.collections.use(\"Wiki\")\n",
241 | "\n",
242 | "wiki.config.update(\n",
243 | " #TODO: set generative model\n",
244 | " # generative_config=Reconfigure.Generative.\n",
245 | ")"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "> Try generative query without providing the model"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "response = wiki.generate.near_text(\n",
262 | " query=\"African animals\",\n",
263 | " auto_limit=1,\n",
264 | " \n",
265 | " grouped_task=\"What African animals do we have info on? Please only list those provided in here.\",\n",
266 | ")\n",
267 | "\n",
268 | "print(f\"{PURPLE}=== Generated Response ===\")\n",
269 | "print(response.generative.text)\n",
270 | "\n",
271 | "print(f\"{BLUE}=== Source ===\")\n",
272 | "for item in response.objects:\n",
273 | " print(item.properties)"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "## Close the client"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "client.close()"
290 | ]
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": ".venv",
296 | "language": "python",
297 | "name": "python3"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 3
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython3",
309 | "version": "3.11.8"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 2
314 | }
315 |
--------------------------------------------------------------------------------
/4-multi-tenancy/1-playground-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "from dotenv import load_dotenv\n",
11 | "\n",
12 | "load_dotenv()\n",
13 | "\n",
14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
16 | "\n",
17 | "print(WEAVIATE_URL[:10])\n",
18 | "print(WEAVIATE_KEY[:10])"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "# Setup\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import weaviate\n",
35 | "from weaviate.classes.init import Auth\n",
36 | "\n",
37 | "client = weaviate.connect_to_weaviate_cloud(\n",
38 | " cluster_url=WEAVIATE_URL,\n",
39 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
40 | ")\n",
41 | "\n",
42 | "client.is_ready()"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Create Tenant-ready collection"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "from weaviate.classes.config import Configure\n",
59 | "\n",
60 | "if (client.collections.exists(\"Play\")):\n",
61 | " client.collections.delete(\"Play\")\n",
62 | "\n",
63 | "client.collections.create(\n",
64 | " \"Play\",\n",
65 | " vector_config=Configure.Vectors.self_provided(),\n",
66 | "\n",
67 | " multi_tenancy_config=Configure.multi_tenancy(True)\n",
68 | "\n",
69 | " # multi_tenancy_config=Configure.multi_tenancy(\n",
70 | " # enabled=True,\n",
71 | " # auto_tenant_creation=True, # Assigning to a non-existent tenant will create it\n",
72 | " # auto_tenant_activation=True\n",
73 | " # )\n",
74 | ")"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## Create tenants\n",
82 | "> tenant name – must be made of alphanumeric characters (a-z, A-Z, 0-9), underscore (_), and hyphen (-), with a length between 1 and 64 characters\n"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from weaviate.classes.tenants import Tenant\n",
92 | "\n",
93 | "play = client.collections.use(\"Play\")\n",
94 | "\n",
95 | "play.tenants.create([\n",
96 | " Tenant(name=\"ten_A\"),\n",
97 | " Tenant(name=\"ten_B\"),\n",
98 | " Tenant(name=\"ten_C\"),\n",
99 | " Tenant(name=\"ten_D\"),\n",
100 | "])"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## List Tenants"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "play.tenants.get()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "play.tenants.exists(\"ten_E\")"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "## Access Tenants"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# this will fail – multi-tenant collections require us to use tenants\n",
142 | "play.aggregate.over_all()"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "tenA = play.with_tenant(\"ten_A\")\n",
152 | "\n",
153 | "tenA.aggregate.over_all()"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "### Insert data"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
170 | "play = client.collections.use(\"Play\")\n",
171 | "tenA = play.with_tenant(\"ten_A\")\n",
172 | "\n",
173 | "tenA.data.insert_many([\n",
174 | " {\n",
175 | " \"title\": \"A book about vector databases\"\n",
176 | " },\n",
177 | " {\n",
178 | " \"title\": \"Tutorial for multimodal collections\"\n",
179 | " },\n",
180 | "])\n",
181 | "\n",
182 | "tenA.aggregate.over_all()"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "### Query Example"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "from weaviate.classes.query import Filter\n",
199 | "\n",
200 | "tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
201 | "\n",
202 | "response = tenA.query.fetch_objects(\n",
203 | " filters=Filter.by_property(\"title\").like(\"about\")\n",
204 | ")\n",
205 | "\n",
206 | "for item in response.objects:\n",
207 | " print(item.properties)"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "### Delete Tenants"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "play.tenants.remove([\"ten_D\"])\n",
224 | "play.tenants.get()"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Update Tenants – Active & Inactive & Offloaded\n",
232 | "Tenants can be:\n",
233 | "* `Active` (default) - active tenants use `HOT` resources (RAM)\n",
234 | "* `Inactive` - inactive tenants cannot be searched on; their index is not loaded into memory, so they don't use RAM\n",
235 | "* `Offloaded` - offloaded tenants are moved to a cloud storage\n",
236 | "\n",
237 | "> Tenant offloading requires extra configuration, which is out of scope for this workshop.  \n",
238 | "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "### Deactivate – make tenant `Inactive`"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "from weaviate.classes.tenants import Tenant, TenantActivityStatus\n",
255 | "\n",
256 | "play.tenants.update([\n",
257 | " Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.INACTIVE),\n",
258 | "])\n",
259 | "\n",
260 | "play.tenants.get()"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "**Cannot search `Inactive` tenants**"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
277 | "\n",
278 | "response = tenA.query.fetch_objects(\n",
279 | " filters=Filter.by_property(\"title\").like(\"about\")\n",
280 | ")\n",
281 | "\n",
282 | "for item in response.objects:\n",
283 | " print(item.properties)"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "### Activate - make tenant `Active`\n",
291 | "\n",
292 | "> You can't query an inactive tenant, but you can activate it."
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "play.tenants.update([\n",
302 | " Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.ACTIVE),\n",
303 | "])\n",
304 | "\n",
305 | "play.tenants.get()"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "response = tenA.query.fetch_objects(\n",
315 | " filters=Filter.by_property(\"title\").like(\"about\")\n",
316 | ")\n",
317 | "\n",
318 | "for item in response.objects:\n",
319 | " print(item.properties)"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "### Offload - make tenant `offloaded`\n",
327 | "\n",
328 | "> Tenant offloading requires extra configuration, which is out of scope for this workshop.  \n",
329 | "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "# play.tenants.update([\n",
339 | "# Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.OFFLOADED),\n",
340 | "# ])\n",
341 | "\n",
342 | "# play.tenants.get()"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "## Clean up"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "client.collections.delete(\"Play\")"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "## Don't close yet...\n",
366 | "\n",
367 | "> You can try again with `auto_tenant_creation=True` and `auto_tenant_activation=True`"
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {},
373 | "source": [
374 | "## Close the client"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": [
383 | "client.close()"
384 | ]
385 | }
386 | ],
387 | "metadata": {
388 | "kernelspec": {
389 | "display_name": ".venv (3.11.9)",
390 | "language": "python",
391 | "name": "python3"
392 | },
393 | "language_info": {
394 | "codemirror_mode": {
395 | "name": "ipython",
396 | "version": 3
397 | },
398 | "file_extension": ".py",
399 | "mimetype": "text/x-python",
400 | "name": "python",
401 | "nbconvert_exporter": "python",
402 | "pygments_lexer": "ipython3",
403 | "version": "3.11.9"
404 | }
405 | },
406 | "nbformat": 4,
407 | "nbformat_minor": 2
408 | }
409 |
--------------------------------------------------------------------------------
/1-intro/complete/2-query-complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Query the data\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Connect to Weaviate"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import weaviate\n",
44 | "from weaviate.classes.init import Auth\n",
45 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
46 | "\n",
47 | "client = weaviate.connect_to_weaviate_cloud(\n",
48 | " cluster_url=WEAVIATE_URL,\n",
49 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
50 | "\n",
51 | " # additional_config=AdditionalConfig(\n",
52 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
53 | " # )\n",
54 | ")\n",
55 | "\n",
56 | "client.is_ready()"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Helper function"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "import json\n",
73 | "def print_properties(item):\n",
74 | " print(\n",
75 | " json.dumps(\n",
76 | " item.properties,\n",
77 | " indent=2, sort_keys=True, default=str\n",
78 | " )\n",
79 | " )"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "## Vector search\n",
87 | "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "wiki = client.collections.use(\"Wiki\")\n",
97 | "\n",
98 | "response = wiki.query.near_text(\n",
99 | " query=\"musical instruments\",\n",
100 | " limit=5\n",
101 | ")\n",
102 | "\n",
103 | "for item in response.objects:\n",
104 | " print_properties(item)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "from weaviate.classes.query import MetadataQuery\n",
114 | "\n",
115 | "wiki = client.collections.use(\"Wiki\")\n",
116 | "\n",
117 | "response = wiki.query.near_text(\n",
118 | " query=\"musical instruments\",\n",
119 | " limit=5,\n",
120 | " return_metadata=MetadataQuery(distance=True)\n",
121 | ")\n",
122 | "\n",
123 | "for item in response.objects:\n",
124 | " print_properties(item)\n",
125 | " print(item.metadata.distance)"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "### Autocut\n",
133 | "\n",
134 | "Return groups of results based on the quality/distance jumps"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "from weaviate.classes.query import MetadataQuery\n",
144 | "\n",
145 | "wiki = client.collections.use(\"Wiki\")\n",
146 | "\n",
147 | "response = wiki.query.near_text(\n",
148 | " query=\"musical instruments\",\n",
149 | " auto_limit=1,\n",
150 | " return_metadata=MetadataQuery(distance=True)\n",
151 | ")\n",
152 | "\n",
153 | "print(f\"Returned object count: {len(response.objects)}\")\n",
154 | "\n",
155 | "for item in response.objects:\n",
156 | " print_properties(item)\n",
157 | " print(item.metadata.distance)"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "## Filters"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "### Fetch with filters"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "from weaviate.classes.query import Filter\n",
181 | "\n",
182 | "wiki = client.collections.use(\"Wiki\")\n",
183 | "\n",
184 | "response = wiki.query.fetch_objects(\n",
185 | " limit=5,\n",
186 | " filters=Filter.by_property(\"title\").like(\"music\")\n",
187 | ")\n",
188 | "\n",
189 | "for item in response.objects:\n",
190 | " print_properties(item)"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "from weaviate.classes.query import Filter\n",
200 | "\n",
201 | "response = wiki.query.fetch_objects(\n",
202 | " limit=5,\n",
203 | " filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n",
204 | ")\n",
205 | "\n",
206 | "for item in response.objects:\n",
207 | " print_properties(item)"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "## Search with filters\n",
215 | "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "from weaviate.classes.query import Filter\n",
225 | "\n",
226 | "response = wiki.query.near_text(\n",
227 | " query=\"musical instruments\",\n",
228 | " limit=5,\n",
229 | " filters=Filter.by_property(\"title\").not_equal(\"music\")\n",
230 | ")\n",
231 | "\n",
232 | "for item in response.objects:\n",
233 | " print_properties(item)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Keyword Search\n",
241 | "\n",
242 | "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "response = wiki.query.bm25(\n",
252 | " query=\"musical instruments\",\n",
253 | " limit=5,\n",
254 | ")\n",
255 | "\n",
256 | "for item in response.objects:\n",
257 | " print_properties(item)"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "from weaviate.classes.query import MetadataQuery\n",
267 | "\n",
268 | "response = wiki.query.bm25(\n",
269 | " query=\"musical instruments\",\n",
270 | " query_properties=[\"text\", \"title\"],\n",
271 | " limit=5,\n",
272 | " return_metadata=MetadataQuery(score=True)\n",
273 | ")\n",
274 | "\n",
275 | "for item in response.objects:\n",
276 | " print_properties(item)\n",
277 | " print(item.metadata.score)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from weaviate.classes.query import MetadataQuery\n",
287 | "\n",
288 | "response = wiki.query.bm25(\n",
289 | " query=\"musical instruments\",\n",
290 | " query_properties=[\"text\", \"title^3\"],\n",
291 | " limit=5,\n",
292 | " return_metadata=MetadataQuery(score=True)\n",
293 | ")\n",
294 | "\n",
295 | "for item in response.objects:\n",
296 | " print_properties(item)\n",
297 | " print(item.metadata.score)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "## Hybrid search\n",
305 | "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "response = wiki.query.hybrid(\n",
315 | " query=\"musical instruments\",\n",
316 | " alpha=0.7,\n",
317 | " limit=5,\n",
318 | ")\n",
319 | "\n",
320 | "for item in response.objects:\n",
321 | " print_properties(item)"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "### Hybrid - select properties"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "response = wiki.query.hybrid(\n",
338 | " query=\"musical instruments\",\n",
339 | " alpha=0.7,\n",
340 | " limit=5,\n",
341 | " query_properties=[\"title\"]\n",
342 | ")\n",
343 | "\n",
344 | "for item in response.objects:\n",
345 | " print_properties(item)"
346 | ]
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "metadata": {},
351 | "source": [
352 | "### Hybrid - Explain score"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "from weaviate.classes.query import MetadataQuery\n",
362 | "\n",
363 | "response = wiki.query.hybrid(\n",
364 | " query=\"musical instruments\",\n",
365 | " alpha=0.7,\n",
366 | " limit=5,\n",
367 | " query_properties=[\"title\"],\n",
368 | " return_metadata=MetadataQuery(score=True, explain_score=True)\n",
369 | ")\n",
370 | "\n",
371 | "for item in response.objects:\n",
372 | " print_properties(item)\n",
373 | " print(item.metadata.score)\n",
374 | " print(item.metadata.explain_score)"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "## Close the client"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "client.close()"
391 | ]
392 | }
393 | ],
394 | "metadata": {
395 | "kernelspec": {
396 | "display_name": ".venv (3.11.9)",
397 | "language": "python",
398 | "name": "python3"
399 | },
400 | "language_info": {
401 | "codemirror_mode": {
402 | "name": "ipython",
403 | "version": 3
404 | },
405 | "file_extension": ".py",
406 | "mimetype": "text/x-python",
407 | "name": "python",
408 | "nbconvert_exporter": "python",
409 | "pygments_lexer": "ipython3",
410 | "version": "3.11.9"
411 | }
412 | },
413 | "nbformat": 4,
414 | "nbformat_minor": 2
415 | }
416 |
--------------------------------------------------------------------------------
/1-intro/2-query.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Query the data\n",
8 | "\n",
9 | "## Get keys and urls"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "from dotenv import load_dotenv\n",
20 | "\n",
21 | "load_dotenv()\n",
22 | "\n",
23 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
24 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
25 | "\n",
26 | "print(WEAVIATE_URL[:10])\n",
27 | "print(WEAVIATE_KEY[:10])\n",
28 | "\n",
29 | "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n",
30 | " raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Connect to Weaviate"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import weaviate\n",
47 | "from weaviate.classes.init import Auth\n",
48 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
49 | "\n",
50 | "client = weaviate.connect_to_weaviate_cloud(\n",
51 | " cluster_url=WEAVIATE_URL,\n",
52 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
53 | "\n",
54 | " # additional_config=AdditionalConfig(\n",
55 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
56 | " # )\n",
57 | ")\n",
58 | "\n",
59 | "client.is_ready()"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "### Helper function"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "import json\n",
76 | "def print_properties(item):\n",
77 | " print(\n",
78 | " json.dumps(\n",
79 | " item.properties,\n",
80 | " indent=2, sort_keys=True, default=str\n",
81 | " )\n",
82 | " )"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Vector search\n",
90 | "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "# TODO: get the Wiki collection\n",
100 | "# wiki = \n",
101 | "\n",
102 | "# TODO: run a near text query, search for musical instruments, with limit 5\n",
103 | "# response = wiki.query.\n",
104 | "\n",
105 | "for item in response.objects:\n",
106 | " print_properties(item)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "from weaviate.classes.query import MetadataQuery\n",
116 | "\n",
117 | "wiki = client.collections.use(\"Wiki\")\n",
118 | "\n",
119 | "response = wiki.query.near_text(\n",
120 | " query=\"musical instruments\",\n",
121 | " limit=5,\n",
122 | " # TODO: add MetadataQuery - request distance\n",
123 | " # return_metadata=\n",
124 | ")\n",
125 | "\n",
126 | "for item in response.objects:\n",
127 | " print_properties(item)\n",
128 | " print(item.metadata.distance)"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### Autocut\n",
136 | "\n",
137 | "Return groups of results based on the quality/distance jumps"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "from weaviate.classes.query import MetadataQuery\n",
147 | "\n",
148 | "wiki = client.collections.use(\"Wiki\")\n",
149 | "\n",
150 | "response = wiki.query.near_text(\n",
151 | " query=\"musical instruments\",\n",
152 | " # TODO: use auto_limit instead of limit, set it to 1\n",
153 | "\n",
154 | " return_metadata=MetadataQuery(distance=True)\n",
155 | ")\n",
156 | "\n",
157 | "print(f\"Returned object count: {len(response.objects)}\")\n",
158 | "\n",
159 | "for item in response.objects:\n",
160 | " print_properties(item)\n",
161 | " print(item.metadata.distance)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "## Filters"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "### Fetch with filters"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "from weaviate.classes.query import Filter\n",
185 | "\n",
186 | "wiki = client.collections.use(\"Wiki\")\n",
187 | "\n",
188 | "response = wiki.query.fetch_objects(\n",
189 | " limit=5,\n",
190 | " # TODO: filter by property title, search for something like music\n",
191 | " # filters=\n",
192 | ")\n",
193 | "\n",
194 | "for item in response.objects:\n",
195 | " print_properties(item)"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "from weaviate.classes.query import Filter\n",
205 | "\n",
206 | "response = wiki.query.fetch_objects(\n",
207 | " limit=5,\n",
208 | " # NOTE: you can use & as AND operator and | as OR operator\n",
209 | " filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n",
210 | ")\n",
211 | "\n",
212 | "for item in response.objects:\n",
213 | " print_properties(item)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Search with filters\n",
221 | "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "from weaviate.classes.query import Filter\n",
231 | "\n",
232 | "response = wiki.query.near_text(\n",
233 | " query=\"musical instruments\",\n",
234 | " limit=5,\n",
235 | " filters=Filter.by_property(\"title\").not_equal(\"music\")\n",
236 | ")\n",
237 | "\n",
238 | "for item in response.objects:\n",
239 | " print_properties(item)"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "## Keyword Search\n",
247 | "\n",
248 | "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "# TODO: use bm25 query, search for musical instruments, set limit to 5\n",
258 | "\n",
259 | "# response = wiki.\n",
260 | "\n",
261 | "for item in response.objects:\n",
262 | " print_properties(item)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "from weaviate.classes.query import MetadataQuery\n",
272 | "\n",
273 | "response = wiki.query.bm25(\n",
274 | " query=\"musical instruments\",\n",
275 | " # TODO: add query properties for \"text\" and \"title\"\n",
276 | " # query_properties=[],\n",
277 | " limit=5,\n",
278 | " return_metadata=MetadataQuery(score=True)\n",
279 | ")\n",
280 | "\n",
281 | "for item in response.objects:\n",
282 | " print_properties(item)\n",
283 | " print(item.metadata.score)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "from weaviate.classes.query import MetadataQuery\n",
293 | "\n",
294 | "response = wiki.query.bm25(\n",
295 | " query=\"musical instruments\",\n",
296 | " query_properties=[\"text\", \"title^3\"],\n",
297 | " limit=5,\n",
298 | " return_metadata=MetadataQuery(score=True)\n",
299 | ")\n",
300 | "\n",
301 | "for item in response.objects:\n",
302 | " print_properties(item)\n",
303 | " print(item.metadata.score)"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "## Hybrid search\n",
311 | "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "# TODO: use hybrid query, search for musical instruments, set alpha to 0.7, and limit to 5\n",
321 | "# response = wiki.\n",
322 | "\n",
323 | "for item in response.objects:\n",
324 | " print_properties(item)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "### Hybrid - select properties"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "response = wiki.query.hybrid(\n",
341 | " query=\"musical instruments\",\n",
342 | " alpha=0.7,\n",
343 | " limit=5,\n",
344 | " # TODO: add query properties for \"title\"\n",
345 | ")\n",
346 | "\n",
347 | "for item in response.objects:\n",
348 | " print_properties(item)"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "### Hybrid - Explain score"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "from weaviate.classes.query import MetadataQuery\n",
365 | "\n",
366 | "response = wiki.query.hybrid(\n",
367 | " query=\"musical instruments\",\n",
368 | " alpha=0.7,\n",
369 | " limit=5,\n",
370 | " query_properties=[\"title\"],\n",
371 | " return_metadata=MetadataQuery(score=True, explain_score=True)\n",
372 | ")\n",
373 | "\n",
374 | "for item in response.objects:\n",
375 | " print_properties(item)\n",
376 | " print(item.metadata.score)\n",
377 | " print(item.metadata.explain_score)"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {},
383 | "source": [
384 | "## Close the client"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "client.close()"
394 | ]
395 | }
396 | ],
397 | "metadata": {
398 | "kernelspec": {
399 | "display_name": ".venv (3.11.9)",
400 | "language": "python",
401 | "name": "python3"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.11.9"
414 | }
415 | },
416 | "nbformat": 4,
417 | "nbformat_minor": 2
418 | }
419 |
--------------------------------------------------------------------------------
/2-pre-vectorised-data/1-playground-run.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "from dotenv import load_dotenv\n",
11 | "\n",
12 | "load_dotenv()\n",
13 | "\n",
14 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
15 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
16 | "\n",
17 | "print(WEAVIATE_URL[:10])\n",
18 | "print(WEAVIATE_KEY[:10])"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Connect to Weaviate"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import weaviate\n",
35 | "from weaviate.classes.init import Auth\n",
36 | "\n",
37 | "client = weaviate.connect_to_weaviate_cloud(\n",
38 | " cluster_url=WEAVIATE_URL,\n",
39 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
40 | ")\n",
41 | "\n",
42 | "client.is_ready()"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Create a collection with no vectorizer"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "# Note: in practice, you shouldn't rerun this cell, as it deletes your data\n",
59 | "# in \"MyCollection\", and then you need to re-import it again.\n",
60 | "from weaviate.classes.config import Configure, VectorDistances\n",
61 | "\n",
62 | "# Delete the collection if it already exists\n",
63 | "if (client.collections.exists(\"MyCollection\")):\n",
64 | " client.collections.delete(\"MyCollection\")\n",
65 | "\n",
66 | "client.collections.create(\n",
67 | " name=\"MyCollection\",\n",
68 | " vector_config=Configure.Vectors.self_provided( # No vectorizer needed\n",
69 | " vector_index_config=Configure.VectorIndex.hnsw( # Optional\n",
70 | " distance_metric=VectorDistances.COSINE # select preferred distance metric \n",
71 | " )\n",
72 | " ),\n",
73 | ")\n",
74 | "\n",
75 | "print(f\"Successfully created collection: {'MyCollection'}.\")"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Insert an object with a vector"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "my_collection = client.collections.use(\"MyCollection\")\n",
92 | "my_collection.data.insert(\n",
93 | " properties={\n",
94 | " \"title\": \"First Object\",\n",
95 | " \"foo\": 11, \n",
96 | " },\n",
97 | " vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n",
98 | ")"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "res = my_collection.query.fetch_objects(include_vector=True)\n",
108 | "\n",
109 | "print(res.objects[0].properties)\n",
110 | "print(res.objects[0].vector)"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "## Insert many objects with their vectors using batch"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "source = [\n",
127 | " {\n",
128 | " \"title\": \"Second Object\",\n",
129 | " \"foo\": 22,\n",
130 | " \"vector\": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n",
131 | " },\n",
132 | " {\n",
133 | " \"title\": \"Third Object\",\n",
134 | " \"foo\": 33,\n",
135 | " \"vector\": [0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n",
136 | " },\n",
137 | " {\n",
138 | " \"title\": \"Fourth Object\",\n",
139 | " \"foo\": 44,\n",
140 | " \"vector\": [0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n",
141 | " },\n",
142 | " {\n",
143 | " \"title\": \"Fifth Object\",\n",
144 | " \"foo\": 55,\n",
145 | " \"vector\": [0.5, 0.5, 0, 0, 0, 0]\n",
146 | " },\n",
147 | "]"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "\n",
157 | "with my_collection.batch.dynamic() as batch:\n",
158 | " for item in source:\n",
159 | " batch.add_object(\n",
160 | " properties={\n",
161 | " \"title\": item[\"title\"],\n",
162 | " \"foo\": item[\"foo\"],\n",
163 | " },\n",
164 | " vector=item[\"vector\"]\n",
165 | " )"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "## Example with insert_many"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# sample_data = [\n",
182 | "# wc.DataObject(\n",
183 | "# properties={\n",
184 | "# \"title\": \"First Object\",\n",
185 | "# \"foo\": 11, \n",
186 | "# },\n",
187 | "# vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n",
188 | "# ),\n",
189 | "# wc.DataObject(\n",
190 | "# properties={\n",
191 | "# \"title\": \"Second Object\",\n",
192 | "# \"foo\": 22,\n",
193 | "# },\n",
194 | "# vector=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n",
195 | "# ),\n",
196 | "# wc.DataObject(\n",
197 | "# properties={\n",
198 | "# \"title\": \"Third Object\",\n",
199 | "# \"foo\": 33,\n",
200 | "# },\n",
201 | "# vector=[0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n",
202 | "# ),\n",
203 | "# wc.DataObject(\n",
204 | "# properties={\n",
205 | "# \"title\": \"Fourth Object\",\n",
206 | "# \"foo\": 44,\n",
207 | "# },\n",
208 | "# vector=[0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n",
209 | "# ),\n",
210 | "# wc.DataObject(\n",
211 | "# properties={\n",
212 | "# \"title\": \"Fifth Object\",\n",
213 | "# \"foo\": 55,\n",
214 | "# },\n",
215 | "# vector=[0.5, 0.5, 0, 0, 0, 0]\n",
216 | "# ),\n",
217 | "# ]\n",
218 | "\n",
219 | "# my_collection.data.insert_many(sample_data)"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "## Query\n",
227 | "Available types of queries you can run when working with vector embeddings (without modules) in **Weaviate**:\n",
228 | "\n",
229 | "1. [near_vector](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector)\n",
230 | "\n",
231 | "2. [near_object](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object)"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "### nearVector Example\n",
239 | "**First example** - Search Weaviate with a vector embedding, and return title property.\n",
240 | "\n",
241 | "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector) for more."
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "response = my_collection.query.near_vector(\n",
251 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
252 | " limit=2,\n",
253 | ")\n",
254 | "\n",
255 | "for item in response.objects:\n",
256 | " print(item.uuid)\n",
257 | " print(item.properties, \"\\n\")"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "**Second example** - The same search query, but this time also return `distance`, and `vector`."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "from weaviate.classes.query import MetadataQuery\n",
274 | "\n",
275 | "response = my_collection.query.near_vector(\n",
276 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
277 | " include_vector=True,\n",
278 | " return_metadata=MetadataQuery(distance=True),\n",
279 | " limit=2,\n",
280 | ")\n",
281 | "\n",
282 | "for item in response.objects:\n",
283 | " print(item.properties)\n",
284 | " print(item.metadata.distance)\n",
285 | " print(item.vector, \"\\n\")"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "**Third example** – Same vector query, but this time we will filter on \"foo\" (which should be greater than 30). Also, let's return \"title\" and \"foo\".\n",
293 | "\n",
294 | "See [the docs](https://weaviate.io/developers/weaviate/search/filters#filter-with-one-condition) for more."
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "from weaviate.classes.query import Filter, MetadataQuery\n",
304 | "\n",
305 | "response = my_collection.query.near_vector(\n",
306 | " near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
307 | " return_metadata=MetadataQuery(distance=True),\n",
308 | " filters=Filter.by_property(\"foo\").greater_than(30),\n",
309 | " limit=2,\n",
310 | ")\n",
311 | "\n",
312 | "for item in response.objects:\n",
313 | " print(item.properties)\n",
314 | " print(item.metadata.distance, \"\\n\")"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "### nearObject Example\n",
322 | "\n",
323 | "Weaviate also allows you to search for similar objects.\n",
324 | "\n",
325 | "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object) for more.\n",
326 | "\n",
327 | "**Fourth example** - \n",
328 | "Search through `MyCollection` for similar objects, by providing an id from the previous query. \n",
329 | "\n",
330 | "> Note #1: The id was taken from the query above.\n",
331 | "> The generated id for you might be different.\n",
332 | "\n",
333 | "> Note #2: The first object returned is always itself."
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from weaviate.classes.query import MetadataQuery\n",
343 | "\n",
344 | "response = my_collection.query.near_object(\n",
345 | " near_object=\"20805faa-f0b6-404a-aa34-8a44e01e0bcd\",\n",
346 | " return_metadata=MetadataQuery(distance=True),\n",
347 | " limit=3,\n",
348 | ")\n",
349 | "\n",
350 | "for item in response.objects:\n",
351 | " print(item.uuid)\n",
352 | " print(item.properties)\n",
353 | " print(item.metadata.distance, \"\\n\")"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "## Close the client"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "client.close()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": []
378 | }
379 | ],
380 | "metadata": {
381 | "kernelspec": {
382 | "display_name": ".venv (3.11.9)",
383 | "language": "python",
384 | "name": "python3"
385 | },
386 | "language_info": {
387 | "codemirror_mode": {
388 | "name": "ipython",
389 | "version": 3
390 | },
391 | "file_extension": ".py",
392 | "mimetype": "text/x-python",
393 | "name": "python",
394 | "nbconvert_exporter": "python",
395 | "pygments_lexer": "ipython3",
396 | "version": "3.11.9"
397 | }
398 | },
399 | "nbformat": 4,
400 | "nbformat_minor": 2
401 | }
402 |
--------------------------------------------------------------------------------
/1-intro/complete/1-load-data-complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Collection setup and data load"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Get keys and urls"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import os\n",
24 | "from dotenv import load_dotenv\n",
25 | "\n",
26 | "load_dotenv()\n",
27 | "\n",
28 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
29 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
30 | "\n",
31 | "print(WEAVIATE_URL[:10])\n",
32 | "print(WEAVIATE_KEY[:10])"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Connect to Weaviate\n",
40 | "\n",
41 | "You need to pass in your Weaviate Cloud URL and KEY."
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import weaviate\n",
51 | "from weaviate.classes.init import Auth\n",
52 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
53 | "\n",
54 | "client = weaviate.connect_to_weaviate_cloud(\n",
55 | " cluster_url=WEAVIATE_URL,\n",
56 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
57 | "\n",
58 | " # additional_config=AdditionalConfig(\n",
59 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
60 | " # )\n",
61 | ")\n",
62 | "\n",
63 | "client.is_ready()"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "## Create a collection with a vectorizer\n",
71 | "\n",
72 | "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n",
73 | "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n",
74 | "\n",
75 | "Examples of other embedding models:\n",
76 | "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n",
77 | "* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n",
78 | "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n",
79 | "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from weaviate.classes.config import Configure\n",
89 | "\n",
90 | "if client.collections.exists(\"Jeopardy\"):\n",
91 | " client.collections.delete(\"Jeopardy\")\n",
92 | "\n",
93 | "# Create a collection - with Weaviate vectorizer\n",
94 | "client.collections.create(\n",
95 | " name=\"Jeopardy\",\n",
96 | "\n",
97 | " # https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings\n",
98 | " vector_config=Configure.Vectors.text2vec_weaviate(\n",
99 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
100 | " dimensions=256 # options 1024 (default) and 256\n",
101 | " ),\n",
102 | ")"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Import data\n",
110 | "### Sample Data"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "import json\n",
120 | "\n",
121 | "with open(\"../jeopardy_tiny.json\") as file:\n",
122 | " data_10 = json.load(file)\n",
123 | "\n",
124 | "print(json.dumps(data_10[0:2], indent=2))"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "### Insert Many\n",
132 | "\n",
133 | "> `insert_many` is only used for inserting small batches of data - must complete within the timeout.\n",
134 | "\n",
135 | "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Insert data\n",
145 | "jeopardy = client.collections.use(\"Jeopardy\")\n",
146 | "jeopardy.data.insert_many(data_10)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "### Data preview"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "# Show data preview\n",
163 | "jeopardy = client.collections.use(\"Jeopardy\")\n",
164 | "response = jeopardy.query.fetch_objects(limit=4)\n",
165 | "\n",
166 | "for item in response.objects:\n",
167 | " print(item.uuid, item.properties)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "# Show data preview - with vectors\n",
177 | "jeopardy = client.collections.use(\"Jeopardy\")\n",
178 | "response = jeopardy.query.fetch_objects(\n",
179 | " limit=4,\n",
180 | " include_vector=True\n",
181 | ")\n",
182 | "\n",
183 | "for item in response.objects:\n",
184 | " print(item.properties)\n",
185 | " print(item.vector, '\\n')"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "### Super quick query example"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "response = jeopardy.query.near_text(\n",
202 | " query=\"African animals\",\n",
203 | " # query=\"weather\",\n",
204 | " limit=2\n",
205 | ")\n",
206 | "\n",
207 | "for item in response.objects:\n",
208 | " print(item.properties)"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## A bit bigger example - 2k objects"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "### Load data"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "import json\n",
232 | "\n",
233 | "with open(\"../wiki-2k.json\") as file:\n",
234 | " data_2k = json.load(file)\n",
235 | "\n",
236 | "print(json.dumps(data_2k[0:2], indent=2))"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "### Create a collection with Named Vectors and SourceProperties"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "from weaviate.classes.config import Configure, Property, DataType\n",
253 | "\n",
254 | "def create_wiki_collection():\n",
255 | " if client.collections.exists(\"Wiki\"):\n",
256 | " client.collections.delete(\"Wiki\")\n",
257 | "\n",
258 | " # Create a collection here - with Weaviate vectorizer and define source properties\n",
259 | " client.collections.create(\n",
260 | " name=\"Wiki\",\n",
261 | "\n",
262 | " vector_config=[\n",
263 | " Configure.Vectors.text2vec_weaviate(\n",
264 | " name=\"main_vector\",\n",
265 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\", # default\n",
266 | " source_properties=['title', 'text'] # which properties should be used to generate a vector\n",
267 | " )\n",
268 | " ],\n",
269 | "\n",
270 | " # Example: how to define property schema (Optional)\n",
271 | " # properties=[ \n",
272 | " # Property(name=\"title\", data_type=DataType.TEXT),\n",
273 | " # Property(name=\"text\", data_type=DataType.TEXT),\n",
274 | " # Property(name=\"url\", data_type=DataType.TEXT),\n",
275 | " # Property(name=\"wiki_id\", data_type=DataType.TEXT),\n",
276 | " # ],\n",
277 | " )\n",
278 | "\n",
279 | "create_wiki_collection()"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "### Import data - 2k objects with Batch\n",
287 | "\n",
288 | "Batch speeds up the import process by grouping objects to be added in bigger batch groups.\n",
289 | "\n",
290 | "Batch creates an internal buffer to collect objects to be added.\n",
291 | "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n",
292 | "\n",
293 | "Types of batch:\n",
294 | "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n",
295 | "* `fixed_size` - provide a fixed batch_size\n",
296 | "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "### Take 1 – import sample 100"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "from tqdm import tqdm\n",
313 | "\n",
314 | "sample_100 = data_2k[0:100]\n",
315 | "\n",
316 | "wiki = client.collections.use(\"Wiki\")\n",
317 | "\n",
318 | "with wiki.batch.dynamic() as batch:\n",
319 | " for item in tqdm(sample_100):\n",
320 | " batch.add_object(item)\n",
321 | "\n",
322 | "print(f\"Wiki count: {len(wiki)}\")"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "# check for errors\n",
332 | "if(len(wiki.batch.failed_objects)>0):\n",
333 | " print(\"Import complete with errors\")\n",
334 | " for err in wiki.batch.failed_objects:\n",
335 | " print(err)\n",
336 | "else:\n",
337 | " print(\"Import complete with no errors\")"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "### Take 2 – import sample 100 – with UUID\n",
345 | "\n",
346 | "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property."
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "from weaviate.util import generate_uuid5\n",
356 | "\n",
357 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
358 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
359 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
360 | "print(\"====================================\")\n",
361 | "\n",
362 | "print(generate_uuid5(\"This UUID is different\"))\n",
363 | "print(generate_uuid5(\"This UUID is different\"))\n",
364 | "print(\"====================================\")\n",
365 | "\n",
366 | "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n",
367 | "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n",
368 | "print(generate_uuid5(obj1))\n",
369 | "print(generate_uuid5(obj2))\n"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "# recreate the collection to start again\n",
379 | "create_wiki_collection()"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 | "> Rerun the import script multiple times.\n",
387 | "\n",
388 | "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "from tqdm import tqdm\n",
398 | "from weaviate.util import generate_uuid5\n",
399 | "\n",
400 | "sample_100 = data_2k[0:100]\n",
401 | "\n",
402 | "wiki = client.collections.use(\"Wiki\")\n",
403 | "\n",
404 | "with wiki.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:\n",
405 | " for item in tqdm(sample_100):\n",
406 | " id = generate_uuid5(item[\"wiki_id\"])\n",
407 | "\n",
408 | " batch.add_object(\n",
409 | " item,\n",
410 | " uuid=id\n",
411 | " )\n",
412 | "\n",
413 | "print(f\"Wiki count: {len(wiki)}\")"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "### Take 2 - import the rest of the data - but break if multiple errors"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "from tqdm import tqdm\n",
430 | "from weaviate.util import generate_uuid5\n",
431 | "\n",
432 | "wiki = client.collections.use(\"Wiki\")\n",
433 | "\n",
434 | "with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
435 | " for item in tqdm(data_2k):\n",
436 | " id = generate_uuid5(item[\"wiki_id\"])\n",
437 | " batch.add_object(item, uuid=id)\n",
438 | "\n",
439 | " # Check number of errors while running\n",
440 | " if(batch.number_errors > 10):\n",
441 | " print(\"Errors during batch import\")\n",
442 | " break"
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "metadata": {},
448 | "source": [
449 | "### Check for errors"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "if(len(wiki.batch.failed_objects)>0):\n",
459 | " print(\"Import complete with errors\")\n",
460 | " for err in wiki.batch.failed_objects:\n",
461 | " print(err)\n",
462 | "else:\n",
463 | " print(\"Import complete with no errors\")"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "metadata": {},
469 | "source": [
470 | "## Bonus - iterate through all collection data\n",
471 | "\n",
472 | "The client has a built-in function that allows you to iterate through all collection data."
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": null,
478 | "metadata": {},
479 | "outputs": [],
480 | "source": [
481 | "wiki = client.collections.use(\"Wiki\")\n",
482 | "\n",
483 | "counter = 100\n",
484 | "\n",
485 | "for item in wiki.iterator():\n",
486 | " print(item.properties)\n",
487 | "\n",
488 | " if (counter == 0): break\n",
489 | " \n",
490 | " counter -= 1"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "You can also get `vector embeddings`, by using `include_vector`."
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "counter = 10\n",
507 | "\n",
508 | "for item in wiki.iterator(include_vector=True):\n",
509 | " print(item.properties)\n",
510 | " print(item.vector)\n",
511 | "\n",
512 | " if (counter == 0): break\n",
513 | " \n",
514 | " counter -= 1"
515 | ]
516 | },
517 | {
518 | "cell_type": "markdown",
519 | "metadata": {},
520 | "source": [
521 | "## Close the client"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "client.close()"
531 | ]
532 | }
533 | ],
534 | "metadata": {
535 | "kernelspec": {
536 | "display_name": ".venv (3.11.9)",
537 | "language": "python",
538 | "name": "python3"
539 | },
540 | "language_info": {
541 | "codemirror_mode": {
542 | "name": "ipython",
543 | "version": 3
544 | },
545 | "file_extension": ".py",
546 | "mimetype": "text/x-python",
547 | "name": "python",
548 | "nbconvert_exporter": "python",
549 | "pygments_lexer": "ipython3",
550 | "version": "3.11.9"
551 | }
552 | },
553 | "nbformat": 4,
554 | "nbformat_minor": 2
555 | }
556 |
--------------------------------------------------------------------------------
/1-intro/1-load-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Collection setup and data load"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Get keys and urls"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import os\n",
24 | "from dotenv import load_dotenv\n",
25 | "\n",
26 | "load_dotenv()\n",
27 | "\n",
28 | "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
29 | "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
30 | "\n",
31 | "print(WEAVIATE_URL[:10])\n",
32 | "print(WEAVIATE_KEY[:10])\n",
33 | "\n",
34 | "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n",
35 | " raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Connect to Weaviate\n",
43 | "\n",
44 | "You need to pass in your Weaviate Cloud URL and KEY."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import weaviate\n",
54 | "from weaviate.classes.init import Auth\n",
55 | "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
56 | "\n",
57 | "client = weaviate.connect_to_weaviate_cloud(\n",
58 | " cluster_url=WEAVIATE_URL,\n",
59 | " auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
60 | "\n",
61 | " # additional_config=AdditionalConfig(\n",
62 | " # timeout=Timeout(init=2, query=45, insert=120), # Values in seconds\n",
63 | " # )\n",
64 | ")\n",
65 | "\n",
66 | "client.is_ready()"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Create a collection with a vectorizer\n",
74 | "\n",
75 | "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n",
76 | "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n",
77 | "\n",
78 | "Examples of other embedding models:\n",
79 | "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n",
80 | "* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n",
81 | "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n",
82 | "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from weaviate.classes.config import Configure\n",
92 | "\n",
93 | "if client.collections.exists(\"Jeopardy\"):\n",
94 | " client.collections.delete(\"Jeopardy\")\n",
95 | "\n",
96 | "# Create a collection - with Weaviate vectorizer\n",
97 | "client.collections.create(\n",
98 | " name=\"Jeopardy\",\n",
99 | " # TODO: add text2vec_weaviate vectorizer - with:\n",
100 | " # * model - Snowflake/snowflake-arctic-embed-l-v2.0\n",
101 | " \n",
102 | ")"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Import data\n",
110 | "### Sample Data"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "import json\n",
120 | "\n",
121 | "with open(\"./jeopardy_tiny.json\") as file:\n",
122 | " data_10 = json.load(file)\n",
123 | "\n",
124 | "print(json.dumps(data_10[0:2], indent=2))"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "### Insert Many\n",
132 | "\n",
133 | "> `insert_many` is only used for inserting small batches of data - must complete within the timeout.\n",
134 | "\n",
135 | "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Insert data\n",
145 | "\n",
146 | "# TODO: get Jeopardy collection\n",
147 | "# TODO: insert data_10"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "### Data preview"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "# Show data preview\n",
164 | "jeopardy = client.collections.use(\"Jeopardy\")\n",
165 | "\n",
166 | "# TODO: fetch 4 objects\n",
167 | "# response = jeopardy\n",
168 | "\n",
169 | "for item in response.objects:\n",
170 | " print(item.uuid, item.properties)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# Show data preview - with vectors\n",
180 | "response = jeopardy.query.fetch_objects(\n",
181 | " limit=4,\n",
182 | " # TODO: add include_vector=True\n",
183 | ")\n",
184 | "\n",
185 | "for item in response.objects:\n",
186 | " print(item.properties)\n",
187 | " print(item.vector, '\\n')"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "### Super quick query example"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "# TODO: add near text query, search for African animals with limit 2\n",
204 | "# response = jeopardy.query\n",
205 | "\n",
206 | "for item in response.objects:\n",
207 | " print(item.properties)"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "metadata": {},
213 | "source": [
214 | "## A bit bigger example - 2k objects"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "### Load data"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "import json\n",
231 | "\n",
232 | "with open(\"./wiki-2k.json\") as file:\n",
233 | " data_2k = json.load(file)\n",
234 | "\n",
235 | "print(json.dumps(data_2k[0:2], indent=2))"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Create a collection with Named Vectors and SourceProperties"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "from weaviate.classes.config import Configure, Property, DataType\n",
252 | "\n",
253 | "def create_wiki_collection():\n",
254 | " if client.collections.exists(\"Wiki\"):\n",
255 | " client.collections.delete(\"Wiki\")\n",
256 | "\n",
257 | " # Create a collection here - with Weaviate vectorizer and define source properties\n",
258 | " client.collections.create(\n",
259 | " name=\"Wiki\",\n",
260 | "\n",
261 | " vector_config=[\n",
262 | " # NOTE: we are using NamedVectors here\n",
263 | " Configure.Vectors.text2vec_weaviate(\n",
264 | " name=\"main_vector\",\n",
265 | " model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
266 | "\n",
267 | " # TODO: set source properties to \"title\" and \"text\"\n",
268 | " # source_properties=[] # which properties should be used to generate a vector\n",
269 | " )\n",
270 | " ],\n",
271 | "\n",
272 | " # Example: how to define property schema (Optional)\n",
273 | " # properties=[ \n",
274 | " # Property(name=\"title\", data_type=DataType.TEXT),\n",
275 | " # Property(name=\"text\", data_type=DataType.TEXT),\n",
276 | " # Property(name=\"url\", data_type=DataType.TEXT),\n",
277 | " # Property(name=\"wiki_id\", data_type=DataType.TEXT),\n",
278 | " # ],\n",
279 | " )\n",
280 | "\n",
281 | "create_wiki_collection()"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "### Import data - 2k objects with Batch\n",
289 | "\n",
290 | "Batch speeds up the import process by grouping objects to be added in bigger batch groups.\n",
291 | "\n",
292 | "Batch creates an internal buffer to collect objects to be added.\n",
293 | "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n",
294 | "\n",
295 | "Types of batch:\n",
296 | "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n",
297 | "* `fixed_size` - provide a fixed batch_size\n",
298 | "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "### Take 1 – import sample 100"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "from tqdm import tqdm\n",
315 | "\n",
316 | "sample_100 = data_2k[0:100]\n",
317 | "\n",
318 | "wiki = client.collections.use(\"Wiki\")\n",
319 | "\n",
320 | "# TODO: setup dynamic batch\n",
321 | "# loop through the sample_100 data\n",
322 | "# add each object to the batch\n",
323 | "\n",
324 | "print(f\"Wiki count: {len(wiki)}\")"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "# check for errors\n",
334 | "if(len(wiki.batch.failed_objects)>0):\n",
335 | " print(\"Import complete with errors\")\n",
336 | " for err in wiki.batch.failed_objects:\n",
337 | " print(err)\n",
338 | "else:\n",
339 | " print(\"Import complete with no errors\")"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | "### Take 2 – import sample 100 – with UUID\n",
347 | "\n",
348 | "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property."
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "from weaviate.util import generate_uuid5\n",
358 | "\n",
359 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
360 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
361 | "print(generate_uuid5(\"This UUID is always the same\"))\n",
362 | "print(\"====================================\")\n",
363 | "\n",
364 | "print(generate_uuid5(\"This UUID is different\"))\n",
365 | "print(generate_uuid5(\"This UUID is different\"))\n",
366 | "print(\"====================================\")\n",
367 | "\n",
368 | "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n",
369 | "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n",
370 | "print(generate_uuid5(obj1))\n",
371 | "print(generate_uuid5(obj2))\n"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {},
378 | "outputs": [],
379 | "source": [
380 | "# recreate the collection to start again\n",
381 | "create_wiki_collection()"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "> Rerun the import script multiple times.\n",
389 | "\n",
390 | "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase."
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "from tqdm import tqdm\n",
400 | "from weaviate.util import generate_uuid5\n",
401 | "\n",
402 | "sample_100 = data_2k[0:100]\n",
403 | "\n",
404 | "wiki = client.collections.use(\"Wiki\")\n",
405 | "\n",
406 | "with wiki.batch.fixed_size(batch_size=50, concurrent_requests=2) as batch:\n",
407 | " for item in tqdm(sample_100):\n",
408 | " # TODO: generate an id from item[\"wiki_id\"]\n",
409 | " # id = \n",
410 | "\n",
411 | " batch.add_object(\n",
412 | " item,\n",
413 | " # TODO: provide the new id here \n",
414 | " # uuid=\n",
415 | " )\n",
416 | "\n",
417 | "print(f\"Wiki count: {len(wiki)}\")"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | "### Take 2 - import the rest of the data - but break if multiple errors"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "from tqdm import tqdm\n",
434 | "from weaviate.util import generate_uuid5\n",
435 | "\n",
436 | "wiki = client.collections.use(\"Wiki\")\n",
437 | "\n",
438 | "with wiki.batch.fixed_size(batch_size=600, concurrent_requests=2) as batch:\n",
439 | " for item in tqdm(data_2k):\n",
440 | " id = generate_uuid5(item[\"wiki_id\"])\n",
441 | " batch.add_object(item, uuid=id)\n",
442 | "\n",
443 | " # Check number of errors while running\n",
444 | " if(batch.number_errors > 10):\n",
445 | " print(\"Errors during batch import\")\n",
446 | " break"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "### Check for errors"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "if(len(wiki.batch.failed_objects)>0):\n",
463 | " print(\"Import complete with errors\")\n",
464 | " for err in wiki.batch.failed_objects:\n",
465 | " print(err)\n",
466 | "else:\n",
467 | " print(\"Import complete with no errors\")"
468 | ]
469 | },
470 | {
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "## Bonus - iterate through all collection data\n",
475 | "\n",
476 | "The client has a built-in function that allows you to iterate through all collection data."
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "wiki = client.collections.use(\"Wiki\")\n",
486 | "\n",
487 | "counter = 100\n",
488 | "\n",
489 | "for item in wiki.iterator():\n",
490 | " print(item.properties)\n",
491 | "\n",
492 | " if (counter == 0): break\n",
493 | " \n",
494 | " counter -= 1"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "You can also get `vector embeddings`, by using `include_vector`."
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": null,
507 | "metadata": {},
508 | "outputs": [],
509 | "source": [
510 | "counter = 10\n",
511 | "\n",
512 | "for item in wiki.iterator(include_vector=True):\n",
513 | " print(item.properties)\n",
514 | " print(item.vector)\n",
515 | "\n",
516 | " if (counter == 0): break\n",
517 | " \n",
518 | " counter -= 1"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "## Close the client"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "client.close()"
535 | ]
536 | }
537 | ],
538 | "metadata": {
539 | "kernelspec": {
540 | "display_name": ".venv (3.11.9)",
541 | "language": "python",
542 | "name": "python3"
543 | },
544 | "language_info": {
545 | "codemirror_mode": {
546 | "name": "ipython",
547 | "version": 3
548 | },
549 | "file_extension": ".py",
550 | "mimetype": "text/x-python",
551 | "name": "python",
552 | "nbconvert_exporter": "python",
553 | "pygments_lexer": "ipython3",
554 | "version": "3.11.9"
555 | }
556 | },
557 | "nbformat": 4,
558 | "nbformat_minor": 2
559 | }
560 |
--------------------------------------------------------------------------------