├── img
    ├── wcd-create-cluster-1.jpg
    ├── wcd-create-cluster-2.jpg
    └── wcd-enable-async-indexing.png
├── .env
├── 4-multi-tenancy
    ├── dl_data
    │   ├── 2212.10496.pdf
    │   └── 2401.00107.pdf
    ├── 3a-generate-data.ipynb
    ├── 2-setup-run.ipynb
    ├── 4-search-tenants.ipynb
    ├── 3b-load-data.ipynb
    └── 1-playground-run.ipynb
├── .claude
    └── settings.local.json
├── .gitignore
├── install.md
├── _docker
    ├── docker-compose-ollama.yml
    ├── docker-compose-ollama-codespace.yml
    ├── docker-compose.yml
    └── docker-compose-clip.yml
├── prep-data.py
├── .devcontainer
    └── devcontainer.json
├── 1-intro
    ├── jeopardy_tiny.json
    ├── 0-prep-run.ipynb
    ├── complete
    │   ├── 2-query-complete.ipynb
    │   └── 1-load-data-complete.ipynb
    ├── 2-query.ipynb
    └── 1-load-data.ipynb
├── README.md
├── 5-vector-compression
    ├── data_loader.py
    ├── 2-search-run.ipynb
    ├── 1-rq-run.ipynb
    ├── 1-sq-run.ipynb
    ├── 1-bq-run.ipynb
    ├── 1-pq-run.ipynb
    └── 0-vector-indexes.ipynb
├── prep-data.ipynb
├── 2-pre-vectorised-data
    ├── 3-wiki-search-run.ipynb
    ├── 2-wiki-import.ipynb
    ├── complete
    │   └── 2-wiki-import-complete.ipynb
    └── 1-playground-run.ipynb
├── requirements.txt
└── 3-rag
    ├── 2-rag-gen-query-run.ipynb
    ├── complete
        └── 1-rag-complete.ipynb
    └── 1-rag.ipynb


/img/wcd-create-cluster-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-1.jpg


--------------------------------------------------------------------------------
/img/wcd-create-cluster-2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-create-cluster-2.jpg


--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | # WEAVIATE_URL=your_weaviate_url_here
2 | # WEAVIATE_KEY=your_weaviate_key_here
3 | # OPENAI_API_KEY=your_openai_api_key_here
4 | 
5 | 


--------------------------------------------------------------------------------
/img/wcd-enable-async-indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/img/wcd-enable-async-indexing.png


--------------------------------------------------------------------------------
/4-multi-tenancy/dl_data/2212.10496.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2212.10496.pdf


--------------------------------------------------------------------------------
/4-multi-tenancy/dl_data/2401.00107.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weaviate-tutorials/weaviate-workshop/HEAD/4-multi-tenancy/dl_data/2401.00107.pdf


--------------------------------------------------------------------------------
/.claude/settings.local.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "permissions": {
 3 |     "allow": [
 4 |       "mcp__ide__executeCode",
 5 |       "Bash(pip show:*)"
 6 |     ],
 7 |     "deny": [],
 8 |     "ask": []
 9 |   }
10 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .venv*
 2 | __pycache__/
 3 | 
 4 | # distill files
 5 | # **/dl_data/
 6 | .DS_Store
 7 | 
 8 | 
 9 | # ignore temp files/folders with names starting with __
10 | __*
11 | 
12 | # ignore big files
13 | *.parquet
14 | wiki-data


--------------------------------------------------------------------------------
/install.md:
--------------------------------------------------------------------------------
 1 | ## How to set up the Python environment with venv
 2 | To run the project locally, it is best to set up a Python environment with venv.
 3 | 
 4 | ### Setup – do this only once
 5 | First create a new venv configuration.
 6 | ```
 7 | python3 -m venv .venv
 8 | ```
 9 | 
10 | Then switch to the new configuration:
11 | ```
12 | source .venv/bin/activate
13 | ```
14 | 
15 | And install the required packages.
16 | ```
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | ### How to use after
21 | 
22 | **Activate**
23 | If you need to switch back to the venv environment in the future, just run:
24 | ```
25 | source .venv/bin/activate
26 | ```
27 | 
28 | **Deactivate**
29 | To disconnect from the venv environment, call:
30 | ```
31 | deactivate
32 | ```


--------------------------------------------------------------------------------
/_docker/docker-compose-ollama.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: '3.4'
 3 | services:
 4 |   weaviate:
 5 |     command:
 6 |     - --host
 7 |     - 0.0.0.0
 8 |     - --port
 9 |     - '8080'
10 |     - --scheme
11 |     - http
12 |     image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
13 |     ports:
14 |     - 8080:8080
15 |     - 50051:50051
16 |     volumes:
17 |     - weaviate_data:/var/lib/weaviate
18 |     restart: on-failure:0
19 |     environment:
20 |       QUERY_DEFAULTS_LIMIT: 25
21 |       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
22 |       PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
23 |       DEFAULT_VECTORIZER_MODULE: 'none'
24 |       ENABLE_API_BASED_MODULES: 'true'  # quoted so YAML keeps it a string, like the other flags
25 |       ENABLE_MODULES: 'text2vec-ollama,generative-ollama'
26 |       CLUSTER_HOSTNAME: 'node1'
27 | volumes:  
28 |   weaviate_data:
29 | ...


--------------------------------------------------------------------------------
/_docker/docker-compose-ollama-codespace.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: '3.4'
 3 | services:
 4 |   weaviate:
 5 |     command:
 6 |     - --host
 7 |     - 0.0.0.0
 8 |     - --port
 9 |     - '8080'
10 |     - --scheme
11 |     - http
12 |     image: cr.weaviate.io/semitechnologies/weaviate:1.26.4
13 |     ports:
14 |     - 8080:8080
15 |     - 50051:50051
16 |     volumes:
17 |     - weaviate_data:/var/lib/weaviate
18 |     restart: on-failure:0
19 |     environment:
20 |       QUERY_DEFAULTS_LIMIT: 25
21 |       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
22 |       PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
23 |       DEFAULT_VECTORIZER_MODULE: 'none'
24 |       ENABLE_API_BASED_MODULES: 'true'  # quoted so YAML keeps it a string, like the other flags
25 |       ENABLE_MODULES: 'text2vec-ollama,generative-ollama'
26 |       CLUSTER_HOSTNAME: 'node1'
27 |   ollama: # map to locally run ollama models
28 |     image: ollama/ollama:0.2.5
29 |     volumes:
30 |     - /root/.ollama:/root/.ollama
31 | volumes:  
32 |   weaviate_data:
33 | ...


--------------------------------------------------------------------------------
/_docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: '3.4'
 3 | services:
 4 |   weaviate:
 5 |     command:
 6 |     - --host
 7 |     - 0.0.0.0
 8 |     - --port
 9 |     - '8080'
10 |     - --scheme
11 |     - http
12 |     image: semitechnologies/weaviate:1.32.9
13 |     ports:
14 |     - 8080:8080
15 |     - 50051:50051
16 |     volumes:
17 |     - weaviate_data:/var/lib/weaviate
18 |     restart: on-failure:0
19 |     environment:
20 |       QUERY_DEFAULTS_LIMIT: 25
21 |       PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
22 |       ENABLE_API_BASED_MODULES: 'true'
23 |       CLUSTER_HOSTNAME: 'node1'
24 |       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'false'
25 |       AUTHENTICATION_DB_USERS_ENABLED: 'true'
26 |       AUTHENTICATION_APIKEY_ENABLED: 'true'
27 |       AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'root-user-key'
28 |       AUTHENTICATION_APIKEY_USERS: 'root-user'
29 |       AUTHORIZATION_ENABLE_RBAC: 'true'
30 |       AUTHORIZATION_RBAC_ROOT_USERS: 'root-user'
31 |       DEFAULT_VECTORIZER_MODULE: 'none'
32 |       ENABLE_MODULES: ''
33 | volumes:  
34 |   weaviate_data:
35 | ...


--------------------------------------------------------------------------------
/_docker/docker-compose-clip.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | version: '3.4'
 3 | services:
 4 |   weaviate:
 5 |     command:
 6 |     - --host
 7 |     - 0.0.0.0
 8 |     - --port
 9 |     - '8080'
10 |     - --scheme
11 |     - http
12 |     image: cr.weaviate.io/semitechnologies/weaviate:1.30.0
13 |     ports:
14 |     - 8080:8080
15 |     - 50051:50051
16 |     volumes:
17 |     - weaviate_data:/var/lib/weaviate
18 |     restart: on-failure:0
19 |     environment:
20 |       CLIP_INFERENCE_API: 'http://multi2vec-clip:8080'
21 |       QUERY_DEFAULTS_LIMIT: 25
22 |       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
23 |       PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
24 |       DEFAULT_VECTORIZER_MODULE: 'none'
25 |       ENABLE_API_BASED_MODULES: 'true'  # quoted so YAML keeps it a string, like the other flags
26 |       ENABLE_MODULES: 'text2vec-ollama,generative-ollama,multi2vec-clip'
27 |       CLUSTER_HOSTNAME: 'node1'
28 |   multi2vec-clip:
29 |     image: cr.weaviate.io/semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1
30 |     environment:
31 |       ENABLE_CUDA: '0'
32 |   ollama:
33 |     image: ollama/ollama:0.2.5
34 |     # volumes:
35 |     # - /root/.ollama:/root/.ollama
36 | volumes:
37 |   weaviate_data:
38 | ...


--------------------------------------------------------------------------------
/prep-data.py:
--------------------------------------------------------------------------------
 1 | from huggingface_hub import list_repo_files, hf_hub_download
 2 | 
 3 | def list_wiki_datasets():
 4 |     all_files = list_repo_files("weaviate/wiki-sample", repo_type="dataset")
 5 |     
 6 |     # get items with 0001 parquet file, this way we avoid duplicates
 7 |     items = list(filter(lambda path: path.endswith("0001.parquet"), all_files))
 8 | 
 9 |     # remove the parquet from the name
10 |     return [item.replace("/0001.parquet", "") for item in items]
11 | 
12 | def list_dataset_files(dataset):
13 |     dataset_files = list_repo_files("weaviate/wiki-sample", repo_type="dataset")
14 | 
15 |     return list(filter(lambda path: path.startswith(dataset), dataset_files))
16 | 
17 | def download_file(file):
18 |     hf_hub_download(
19 |         repo_id="weaviate/wiki-sample",
20 |         filename=file,
21 |         repo_type="dataset",
22 |         local_dir="wiki-data",
23 |     )
24 | 
25 | def download_source_files(dataset="no-vectors", max_files=1000):
26 |     files_to_download = list_dataset_files(dataset)
27 |     # print(f"Files to download: {files_to_download}")
28 | 
29 |     for file in files_to_download:
30 |         print(f"Downloading {file}")
31 |         download_file(file)
32 | 
33 |         max_files -= 1
34 |         if(max_files == 0): break
35 | 
36 | download_source_files("weaviate/snowflake-arctic-v2", 10)


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/python-3
 3 | {
 4 |   "name": "Weaviate Playground",
 5 |   "image": "mcr.microsoft.com/devcontainers/python:3.12-bullseye",
 6 |   "features": {
 7 |       "ghcr.io/devcontainers/features/docker-in-docker:2.9": {} 
 8 |   },
 9 |   // Configure tool-specific properties.
10 |   "customizations": {
11 |       // Configure properties specific to VS Code.
12 |       "vscode": {
13 |           // Set *default* container specific settings.json values on container create.
14 |           "settings": {
15 |               "python.defaultInterpreterPath": "/usr/local/bin/python",
16 |               "files.exclude": {
17 |                   "__pycache__": true
18 |               }
19 |           },
20 | 
21 |           // Add the IDs of extensions you want installed when the container is created.
22 |           "extensions": [
23 |               "ms-azuretools.vscode-docker",
24 |               "ms-python.python",
25 |               "ms-toolsai.jupyter"
26 |           ]
27 |       }
28 |   },
29 | 
30 | // Use 'postStartCommand' to run commands after the container is started (more frequently than create).
31 | "postStartCommand": "pip3 install --user -r requirements.txt && python3 prep-data.py",
32 | 
33 |   "hostRequirements": {
34 |       "memory": "16gb",
35 |       "cpus": 4
36 |   },
37 | 
38 |   // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
39 |   "remoteUser": "vscode"
40 | }


--------------------------------------------------------------------------------
/1-intro/jeopardy_tiny.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "Category": "SCIENCE",
 4 |         "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
 5 |         "Answer": "Liver"
 6 |     },
 7 |     {
 8 |         "Category": "ANIMALS",
 9 |         "Question": "It's the only living mammal in the order Proboscidea",
10 |         "Answer": "Elephant"
11 |     },
12 |     {
13 |         "Category": "ANIMALS",
14 |         "Question": "The gavial looks very much like a crocodile except for this bodily feature",
15 |         "Answer": "the nose or snout"
16 |     },
17 |     {
18 |         "Category": "ANIMALS",
19 |         "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
20 |         "Answer": "Antelope"
21 |     },
22 |     {
23 |         "Category": "ANIMALS",
24 |         "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
25 |         "Answer": "the diamondback rattler"
26 |     },
27 |     {
28 |         "Category": "SCIENCE",
29 |         "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
30 |         "Answer": "species"
31 |     },
32 |     {
33 |         "Category": "SCIENCE",
34 |         "Question": "A metal that is ductile can be pulled into this while cold & under pressure",
35 |         "Answer": "wire"
36 |     },
37 |     {
38 |         "Category": "SCIENCE",
39 |         "Question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance",
40 |         "Answer": "DNA"
41 |     },
42 |     {
43 |         "Category": "SCIENCE",
44 |         "Question": "Changes in the tropospheric layer of this are what gives us weather",
45 |         "Answer": "the atmosphere"
46 |     },
47 |     {
48 |         "Category": "SCIENCE",
49 |         "Question": "In 70-degree air, a plane traveling at about 1,130 feet per second breaks it",
50 |         "Answer": "Sound barrier"
51 |     }
52 | ]


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Weaviate Workshop
 2 | 
 3 | ## What you need for the workshop
 4 | 
 5 | * API Keys for embedding models, like:
 6 |   * OpenAI - [API keys](https://platform.openai.com/settings/profile?tab=api-keys)
 7 |   * etc.
 8 | 
 9 | ## Create a Weaviate Cloud instance
10 | 
11 |   * Head to [Weaviate Cloud console](https://console.weaviate.cloud/) and log in, or create a new account.
12 |   * Create a free `Sandbox` cluster. Give it a name, select the cloud region and press "Create".
13 | 
14 | ![wcd create cluster - step 1](img/wcd-create-cluster-1.jpg)
15 | ![wcd create cluster - step 2](img/wcd-create-cluster-2.jpg)
16 | 
17 | ## Running the workshop
18 | 
19 | ### Option 1 - Run locally
20 | 
21 | #### Virtual environment – do this only once
22 | First create a new venv configuration.
23 | ```
24 | python3 -m venv .venv
25 | ```
26 | 
27 | Then switch to the new configuration:
28 | ```
29 | source .venv/bin/activate
30 | ```
31 | 
32 | And install the required packages.
33 | ```
34 | pip install -r requirements.txt
35 | ```
36 | 
37 | ### Option 2 - GitHub CodeSpaces instructions
38 | 
39 | 1. Go to the project [https://github.com/weaviate-tutorials/weaviate-workshop](https://github.com/weaviate-tutorials/weaviate-workshop)
40 | 
41 | Make sure you are logged in with GitHub.
42 | 
43 | 2. Create a Codespace project
44 |   * Press the green `<> Code` button, then switch to `Codespaces` tab.
45 |   * Press the `Create codespace on main` button.
46 |   * Your codespace project will install all the necessary components; this will take a few minutes.
47 | 
48 | 
49 | ## Env vars
50 | 
51 | Uncomment and update the env vars in `.env`.
52 | 
53 | Hint: you can find your Weaviate Cluster URL and API keys in the [WCD console](https://console.weaviate.cloud/).
54 | 
55 | * WEAVIATE_URL - is the `REST Endpoint`
56 | * WEAVIATE_KEY - is the `Admin` key in `API Keys`
57 | 
58 | ## Test your setup
59 | 
60 | Head to [1-intro/0-prep-run.ipynb](./1-intro/0-prep-run.ipynb), and run through all steps.
61 | 
62 | ## Download the prevectorized data
63 | 
64 | Head to [prep-data.ipynb](./prep-data.ipynb) and run all the cells. This should download the data we will use in the second lesson.
65 | 
66 | ## Enable async indexing in the Cloud Console or in Docker
67 | 
68 | In the Cloud Console:
69 | ![wcd enable async - step 1](img/wcd-enable-async-indexing.png)
70 | 


--------------------------------------------------------------------------------
/5-vector-compression/data_loader.py:
--------------------------------------------------------------------------------
 1 | from datasets import load_dataset
 2 | from tqdm import tqdm
 3 | from weaviate.util import generate_uuid5
 4 | 
 5 | def prepare_dataset():
 6 |     dt = load_dataset('parquet', data_files={'train': ['../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split="train", streaming=True)
 7 |     # dt = load_dataset("weaviate/wiki-sample", "weaviate-snowflake-arctic-v2", split="train", streaming=True)
 8 | 
 9 |     print(f"Loaded Dataset: '{dt.info.dataset_name}' - Config: '{dt.info.config_name}'")
10 | 
11 |     return dt
12 | 
13 | def test_dataset():
14 |     dt = prepare_dataset()
15 | 
16 |     counter = 10
17 |     for item in dt:
18 |         print(item)
19 | 
20 |         counter -= 1
21 |         if(counter == 0): break
22 | 
23 | def import_wiki_data(client, collection_name, max_rows=20_000):
24 |     if(client.collections.exists(collection_name) == False):
25 |         print(f"Error: Collection {collection_name} doesn't exist")
26 |         return
27 | 
28 |     print(f"Importing {max_rows} data items")
29 | 
30 |     dataset = prepare_dataset()
31 |     wiki = client.collections.use(collection_name)
32 | 
33 |     counter = 0
34 | 
35 |     with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
36 |         for item in tqdm(dataset, total=max_rows):
37 | 
38 |             data_to_insert = {   
39 |                 "wiki_id": item["wiki_id"],
40 |                 "text": item["text"],
41 |                 "title": item["title"],
42 |                 "url": item["url"],
43 |             }
44 | 
45 |             item_id = generate_uuid5(item["wiki_id"])
46 | 
47 |             # vector = item["vector"]
48 |             item_vector = {
49 |                 "main_vector": item["vector"]
50 |             }
51 | 
52 |             batch.add_object(
53 |                 properties=data_to_insert,
54 |                 
55 |                 uuid=item_id,
56 |                 vector=item_vector
57 |             )
58 | 
59 |             # Check number of errors while running
60 |             if(batch.number_errors > 10):
61 |                 print(f"Reached {batch.number_errors} Errors during batch import")
62 |                 break
63 |             
64 |             # stop after the request number reaches = max_rows
65 |             counter += 1
66 |             if(counter >= max_rows):
67 |                 break
68 |     
69 |     # check for errors at the end
70 |     if (len(wiki.batch.failed_objects)>0):
71 |         print("Final error check")
72 |         print(f"Some errors {len(wiki.batch.failed_objects)}")
73 |         print(wiki.batch.failed_objects[-1])
74 |     
75 |     print(f"Imported {counter} items")
76 |     print("-----------------------------------")


--------------------------------------------------------------------------------
/prep-data.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "from huggingface_hub import list_repo_files, hf_hub_download\n",
10 |     "\n",
11 |     "def list_wiki_datasets():\n",
12 |     "    all_files = list_repo_files(\"weaviate/wiki-sample\", repo_type=\"dataset\")\n",
13 |     "    \n",
14 |     "    # get items with 0001 parquet file, this way we avoid duplicates\n",
15 |     "    items = list(filter(lambda path: path.endswith(\"0001.parquet\"), all_files))\n",
16 |     "\n",
17 |     "    # remove the parquet from the name\n",
18 |     "    return [item.replace(\"/0001.parquet\", \"\") for item in items]\n",
19 |     "\n",
20 |     "def list_dataset_files(dataset):\n",
21 |     "    dataset_files = list_repo_files(\"weaviate/wiki-sample\", repo_type=\"dataset\")\n",
22 |     "\n",
23 |     "    return list(filter(lambda path: path.startswith(dataset), dataset_files))\n",
24 |     "\n",
25 |     "def download_file(file):\n",
26 |     "    hf_hub_download(\n",
27 |     "        repo_id=\"weaviate/wiki-sample\",\n",
28 |     "        filename=file,\n",
29 |     "        repo_type=\"dataset\",\n",
30 |     "        local_dir=\"wiki-data\",\n",
31 |     "    )\n",
32 |     "\n",
33 |     "def download_source_files(dataset=\"no-vectors\", max_files=1000):\n",
34 |     "    files_to_download = list_dataset_files(dataset)\n",
35 |     "    print(f\"Files to download: {files_to_download}\")\n",
36 |     "\n",
37 |     "    for file in files_to_download:\n",
38 |     "        print(f\"Downloading {file}\")\n",
39 |     "        download_file(file)\n",
40 |     "\n",
41 |     "        max_files -= 1\n",
42 |     "        if(max_files == 0): break"
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "code",
47 |    "execution_count": null,
48 |    "metadata": {},
49 |    "outputs": [],
50 |    "source": [
51 |     "list_wiki_datasets()"
52 |    ]
53 |   },
54 |   {
55 |    "cell_type": "code",
56 |    "execution_count": null,
57 |    "metadata": {},
58 |    "outputs": [],
59 |    "source": [
60 |     "download_source_files(\"weaviate/snowflake-arctic-v2\", 10)"
61 |    ]
62 |   }
63 |  ],
64 |  "metadata": {
65 |   "kernelspec": {
66 |    "display_name": ".venv (3.11.9)",
67 |    "language": "python",
68 |    "name": "python3"
69 |   },
70 |   "language_info": {
71 |    "codemirror_mode": {
72 |     "name": "ipython",
73 |     "version": 3
74 |    },
75 |    "file_extension": ".py",
76 |    "mimetype": "text/x-python",
77 |    "name": "python",
78 |    "nbconvert_exporter": "python",
79 |    "pygments_lexer": "ipython3",
80 |    "version": "3.11.9"
81 |   }
82 |  },
83 |  "nbformat": 4,
84 |  "nbformat_minor": 2
85 | }
86 | 


--------------------------------------------------------------------------------
/2-pre-vectorised-data/3-wiki-search-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Query Data - show it works\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "import weaviate\n",
 37 |     "from weaviate.classes.init import Auth\n",
 38 |     "\n",
 39 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 40 |     "    cluster_url=WEAVIATE_URL,\n",
 41 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 42 |     ")\n",
 43 |     "\n",
 44 |     "client.is_ready()"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "## Vector search"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "wiki = client.collections.use(\"Wiki\")\n",
 61 |     "\n",
 62 |     "response = wiki.query.near_text(\n",
 63 |     "    query=\"musical instruments\",\n",
 64 |     "    limit=5\n",
 65 |     ")\n",
 66 |     "\n",
 67 |     "for item in response.objects:\n",
 68 |     "    print(item.properties)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "## Close the client"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "client.close()"
 85 |    ]
 86 |   }
 87 |  ],
 88 |  "metadata": {
 89 |   "kernelspec": {
 90 |    "display_name": ".venv (3.11.9)",
 91 |    "language": "python",
 92 |    "name": "python3"
 93 |   },
 94 |   "language_info": {
 95 |    "codemirror_mode": {
 96 |     "name": "ipython",
 97 |     "version": 3
 98 |    },
 99 |    "file_extension": ".py",
100 |    "mimetype": "text/x-python",
101 |    "name": "python",
102 |    "nbconvert_exporter": "python",
103 |    "pygments_lexer": "ipython3",
104 |    "version": "3.11.9"
105 |   }
106 |  },
107 |  "nbformat": 4,
108 |  "nbformat_minor": 2
109 | }
110 | 


--------------------------------------------------------------------------------
/4-multi-tenancy/3a-generate-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Multi-tenant Chat with Papers - Reading data from papers\n",
  8 |     "\n",
  9 |     "### Helper function to load content from arxiv papers - `from_arxiv_paper`"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import json\n",
 19 |     "from distyll.text import from_arxiv_paper\n",
 20 |     "\n",
 21 |     "paper = from_arxiv_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n",
 22 |     "\n",
 23 |     "print(json.dumps(paper, indent=2))"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "### Helper function to chunk up a very long text - `chunk_text`"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": null,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "json.dumps??"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "from distyll.utils import chunk_text\n",
 49 |     "\n",
 50 |     "chunks = chunk_text(source_text=paper[\"text\"], token_length=200)\n",
 51 |     "print(json.dumps(chunks, indent=2))"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "### Combine read and chunk - `get_chunks_from_paper`"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "from distyll.text import from_arxiv_paper\n",
 68 |     "from distyll.utils import chunk_text\n",
 69 |     "\n",
 70 |     "def get_chunks_from_paper(url):\n",
 71 |     "    paper = from_arxiv_paper(url)\n",
 72 |     "    chunks = chunk_text(source_text=paper[\"text\"])\n",
 73 |     "\n",
 74 |     "    paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n",
 75 |     "    paper[\"chunks\"] = chunks\n",
 76 |     "    return paper"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "#### Test an example"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")"
 93 |    ]
 94 |   }
 95 |  ],
 96 |  "metadata": {
 97 |   "kernelspec": {
 98 |    "display_name": ".venv (3.11.9)",
 99 |    "language": "python",
100 |    "name": "python3"
101 |   },
102 |   "language_info": {
103 |    "codemirror_mode": {
104 |     "name": "ipython",
105 |     "version": 3
106 |    },
107 |    "file_extension": ".py",
108 |    "mimetype": "text/x-python",
109 |    "name": "python",
110 |    "nbconvert_exporter": "python",
111 |    "pygments_lexer": "ipython3",
112 |    "version": "3.11.9"
113 |   }
114 |  },
115 |  "nbformat": 4,
116 |  "nbformat_minor": 2
117 | }
118 | 


--------------------------------------------------------------------------------
/5-vector-compression/2-search-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import os\n",
 10 |     "from dotenv import load_dotenv\n",
 11 |     "\n",
 12 |     "load_dotenv()\n",
 13 |     "\n",
 14 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 15 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 16 |     "\n",
 17 |     "print(WEAVIATE_URL[:10])\n",
 18 |     "print(WEAVIATE_KEY[:10])"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "import weaviate\n",
 28 |     "from weaviate.classes.init import Auth\n",
 29 |     "\n",
 30 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 31 |     "    cluster_url=WEAVIATE_URL,\n",
 32 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 33 |     ")\n",
 34 |     "\n",
 35 |     "client.is_ready()"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "## Preview data"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "from weaviate.classes.query import Filter\n",
 52 |     "\n",
 53 |     "wikiQ = client.collections.use(\"WikiQ\")\n",
 54 |     "\n",
 55 |     "response = wikiQ.query.fetch_objects(\n",
 56 |     "    filters=Filter.by_property(\"text\").like(\"musical\"),\n",
 57 |     "    limit=5\n",
 58 |     ")\n",
 59 |     "\n",
 60 |     "for item in response.objects:\n",
 61 |     "    print(item.properties[\"wiki_id\"])\n",
 62 |     "    print(item.properties[\"title\"])\n",
 63 |     "    print(item.properties[\"text\"], '\\n')"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "## Vector search"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "wikiQ = client.collections.use(\"WikiQ\")\n",
 80 |     "\n",
 81 |     "response = wikiQ.query.near_text(\n",
 82 |     "    query=\"musical instruments\",\n",
 83 |     "    limit=5\n",
 84 |     ")\n",
 85 |     "\n",
 86 |     "for item in response.objects:\n",
 87 |     "    print(item.properties)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "> Have fun! Add your own queries."
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "## Close the client"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "client.close()"
111 |    ]
112 |   }
113 |  ],
114 |  "metadata": {
115 |   "kernelspec": {
116 |    "display_name": ".venv (3.11.9)",
117 |    "language": "python",
118 |    "name": "python3"
119 |   },
120 |   "language_info": {
121 |    "codemirror_mode": {
122 |     "name": "ipython",
123 |     "version": 3
124 |    },
125 |    "file_extension": ".py",
126 |    "mimetype": "text/x-python",
127 |    "name": "python",
128 |    "nbconvert_exporter": "python",
129 |    "pygments_lexer": "ipython3",
130 |    "version": "3.11.9"
131 |   }
132 |  },
133 |  "nbformat": 4,
134 |  "nbformat_minor": 2
135 | }
136 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | aiohttp==3.9.5
  2 | aiosignal==1.3.1
  3 | annotated-types==0.7.0
  4 | anyio==4.8.0
  5 | appnope==0.1.4
  6 | argon2-cffi==23.1.0
  7 | argon2-cffi-bindings==21.2.0
  8 | arrow==1.3.0
  9 | asttokens==3.0.0
 10 | async-lru==2.0.4
 11 | async-timeout==4.0.3
 12 | attrs==23.2.0
 13 | Authlib==1.3.1
 14 | babel==2.16.0
 15 | beautifulsoup4==4.12.3
 16 | bleach==6.1.0
 17 | boto3==1.34.144
 18 | botocore==1.34.144
 19 | Brotli==1.1.0
 20 | certifi==2025.1.31
 21 | cffi==1.17.1
 22 | charset-normalizer==3.3.2
 23 | cohere==5.6.1
 24 | comm==0.2.2
 25 | cryptography==44.0.1
 26 | datasets==2.20.0
 27 | debugpy==1.8.2
 28 | decorator==4.4.2
 29 | defusedxml==0.7.1
 30 | deprecation==2.1.0
 31 | dill==0.3.8
 32 | distro==1.9.0
 33 | distyll-info==0.3.1
 34 | dotenv==0.9.9
 35 | exceptiongroup==1.3.0
 36 | executing==2.2.0
 37 | fastavro==1.9.5
 38 | fastjsonschema==2.20.0
 39 | filelock==3.15.4
 40 | fqdn==1.5.1
 41 | frozenlist==1.4.1
 42 | fsspec==2024.5.0
 43 | grpcio==1.70.0
 44 | grpcio-health-checking==1.70.0
 45 | grpcio-tools==1.70.0
 46 | h11==0.14.0
 47 | httpcore==1.0.7
 48 | httpx==0.28.1
 49 | httpx-sse==0.4.0
 50 | huggingface-hub==0.23.5
 51 | idna==3.10
 52 | imageio==2.35.1
 53 | imageio-ffmpeg==0.5.1
 54 | ipykernel==6.29.5
 55 | ipython==8.32.0
 56 | ipywidgets==8.1.5
 57 | isoduration==20.11.0
 58 | jedi==0.19.2
 59 | Jinja2==3.1.4
 60 | jiter==0.8.2
 61 | jmespath==1.0.1
 62 | json5==0.9.25
 63 | jsonpointer==3.0.0
 64 | jsonschema==4.23.0
 65 | jsonschema-specifications==2023.12.1
 66 | jupyter==1.1.1
 67 | jupyter-console==6.6.3
 68 | jupyter-events==0.10.0
 69 | jupyter-lsp==2.2.5
 70 | jupyter_client==8.6.2
 71 | jupyter_core==5.7.2
 72 | jupyter_server==2.14.2
 73 | jupyter_server_terminals==0.5.3
 74 | jupyterlab==4.2.5
 75 | jupyterlab_pygments==0.3.0
 76 | jupyterlab_server==2.27.3
 77 | jupyterlab_widgets==3.0.13
 78 | load-dotenv==0.1.0
 79 | MarkupSafe==2.1.5
 80 | matplotlib-inline==0.1.7
 81 | mistune==3.0.2
 82 | moviepy==1.0.3
 83 | multidict==6.0.5
 84 | multiprocess==0.70.16
 85 | mutagen==1.47.0
 86 | nbclient==0.10.0
 87 | nbconvert==7.16.4
 88 | nbformat==5.10.4
 89 | nest-asyncio==1.6.0
 90 | notebook==7.2.2
 91 | notebook_shim==0.2.4
 92 | numpy==2.0.0
 93 | openai==1.64.0
 94 | overrides==7.7.0
 95 | packaging==24.1
 96 | pandas==2.2.2
 97 | pandocfilters==1.5.1
 98 | parameterized==0.9.0
 99 | parso==0.8.4
100 | pexpect==4.9.0
101 | pillow==10.4.0
102 | platformdirs==4.2.2
103 | proglog==0.1.10
104 | prometheus_client==0.20.0
105 | prompt_toolkit==3.0.50
106 | protobuf==5.29.3
107 | psutil==6.0.0
108 | ptyprocess==0.7.0
109 | pure_eval==0.2.3
110 | pyarrow==17.0.0
111 | pyarrow-hotfix==0.6
112 | pycparser==2.22
113 | pycryptodomex==3.20.0
114 | pydantic==2.10.6
115 | pydantic_core==2.27.2
116 | pydub==0.25.1
117 | Pygments==2.19.1
118 | pypdf==4.3.1
119 | python-dateutil==2.9.0.post0
120 | python-dotenv==1.0.1
121 | python-json-logger==2.0.7
122 | pytz==2024.1
123 | PyYAML==6.0.1
124 | pyzmq==26.0.3
125 | referencing==0.35.1
126 | requests==2.32.3
127 | rfc3339-validator==0.1.4
128 | rfc3986-validator==0.1.1
129 | rpds-py==0.20.0
130 | s3transfer==0.10.2
131 | Send2Trash==1.8.3
132 | six==1.16.0
133 | sniffio==1.3.1
134 | soupsieve==2.6
135 | stack-data==0.6.3
136 | terminado==0.18.1
137 | tinycss2==1.3.0
138 | tokenizers==0.19.1
139 | tomli==2.2.1
140 | tornado==6.4.1
141 | tqdm==4.67.1
142 | traitlets==5.14.3
143 | types-python-dateutil==2.9.0.20240821
144 | types-requests==2.32.0.20240712
145 | typing_extensions==4.12.2
146 | tzdata==2024.1
147 | uri-template==1.3.0
148 | urllib3==2.2.2
149 | validators==0.34.0
150 | wcwidth==0.2.13
151 | weaviate-client==4.17.0
152 | webcolors==24.8.0
153 | webencodings==0.5.1
154 | websocket-client==1.8.0
155 | websockets==13.0.1
156 | widgetsnbextension==4.0.13
157 | xxhash==3.4.1
158 | yarl==1.9.4
159 | yt-dlp==2023.12.30
160 | 


--------------------------------------------------------------------------------
/1-intro/0-prep-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Check if everything is in place"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## Weaviate Python Client v4\n",
 15 |     "> This notebook was created with Weaviate `1.26` and the Weaviate Client `4.7`\n",
 16 |     "\n",
 17 |     "Run the command below to check that you are running the latest version of the Weaviate Python Client v4."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "!pip show weaviate-client"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "## Get keys and urls\n",
 34 |     "\n",
 35 |     "> You can update your env variables in the `.env` file at the root of the project."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import os\n",
 45 |     "from dotenv import load_dotenv\n",
 46 |     "\n",
 47 |     "load_dotenv()\n",
 48 |     "\n",
 49 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 50 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 51 |     "\n",
 52 |     "print(WEAVIATE_URL[:10])\n",
 53 |     "print(WEAVIATE_KEY[:10])"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "## Connect to Weaviate"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "import weaviate\n",
 70 |     "from weaviate.classes.init import Auth\n",
 71 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 72 |     "\n",
 73 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 74 |     "    cluster_url=WEAVIATE_URL,\n",
 75 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 76 |     "\n",
 77 |     "    # additional_config=AdditionalConfig(\n",
 78 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 79 |     "    # )\n",
 80 |     ")\n",
 81 |     "\n",
 82 |     "client.is_ready()"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "## Display the available modules\n",
 90 |     "\n",
 91 |     "> You should be able to see 'generative-openai' and 'text2vec-openai', plus many other modules."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "client.get_meta()"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "### Close the client\n",
108 |     "When you are done with the client, you should close it to release the resources."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "client.close()"
118 |    ]
119 |   }
120 |  ],
121 |  "metadata": {
122 |   "kernelspec": {
123 |    "display_name": ".venv (3.11.9)",
124 |    "language": "python",
125 |    "name": "python3"
126 |   },
127 |   "language_info": {
128 |    "codemirror_mode": {
129 |     "name": "ipython",
130 |     "version": 3
131 |    },
132 |    "file_extension": ".py",
133 |    "mimetype": "text/x-python",
134 |    "name": "python",
135 |    "nbconvert_exporter": "python",
136 |    "pygments_lexer": "ipython3",
137 |    "version": "3.11.9"
138 |   }
139 |  },
140 |  "nbformat": 4,
141 |  "nbformat_minor": 2
142 | }
143 | 


--------------------------------------------------------------------------------
/4-multi-tenancy/2-setup-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Multi-tenant Chat with Papers - Setup\n",
  8 |     "## Get keys and urls"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import os\n",
 18 |     "from dotenv import load_dotenv\n",
 19 |     "\n",
 20 |     "load_dotenv()\n",
 21 |     "\n",
 22 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 23 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 24 |     "\n",
 25 |     "print(WEAVIATE_URL[:10])\n",
 26 |     "print(WEAVIATE_KEY[:10])"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "## Connect to Weaviate"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import weaviate\n",
 43 |     "from weaviate.classes.init import Auth\n",
 44 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     "\n",
 50 |     "    # additional_config=AdditionalConfig(\n",
 51 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 52 |     "    # )\n",
 53 |     ")\n",
 54 |     "\n",
 55 |     "client.is_ready()"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Create Tenant-ready collection"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "from weaviate.classes.config import Configure\n",
 72 |     "\n",
 73 |     "if (client.collections.exists(\"Papers\")):\n",
 74 |     "    client.collections.delete(\"Papers\")\n",
 75 |     "\n",
 76 |     "client.collections.create(\n",
 77 |     "    \"Papers\",\n",
 78 |     "\n",
 79 |     "    vector_config=[\n",
 80 |     "        Configure.Vectors.text2vec_weaviate(\n",
 81 |     "            name=\"main_vector\",\n",
 82 |     "\n",
 83 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 84 |     "            source_properties=[\"chunk\"]\n",
 85 |     "        )\n",
 86 |     "    ],\n",
 87 |     "\n",
 88 |     "    # supported models: https://weaviate.io/developers/weaviate/model-providers/openai/generative#available-models\n",
 89 |     "    generative_config=Configure.Generative.openai(\n",
 90 |     "        model=\"gpt-4o-mini\" # gpt-4\n",
 91 |     "    ),\n",
 92 |     "\n",
 93 |     "    multi_tenancy_config=Configure.multi_tenancy(True)\n",
 94 |     ")"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "## List Tenants"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "papers = client.collections.use(\"Papers\")\n",
111 |     "papers.tenants.get()"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "## Close the client"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "client.close()"
128 |    ]
129 |   }
130 |  ],
131 |  "metadata": {
132 |   "kernelspec": {
133 |    "display_name": ".venv (3.11.9)",
134 |    "language": "python",
135 |    "name": "python3"
136 |   },
137 |   "language_info": {
138 |    "codemirror_mode": {
139 |     "name": "ipython",
140 |     "version": 3
141 |    },
142 |    "file_extension": ".py",
143 |    "mimetype": "text/x-python",
144 |    "name": "python",
145 |    "nbconvert_exporter": "python",
146 |    "pygments_lexer": "ipython3",
147 |    "version": "3.11.9"
148 |   }
149 |  },
150 |  "nbformat": 4,
151 |  "nbformat_minor": 2
152 | }
153 | 


--------------------------------------------------------------------------------
/5-vector-compression/1-rq-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# RQ Compression – Load Data and compress vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     ")\n",
 50 |     "\n",
 51 |     "client.is_ready()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "## Create Collection with RQ configuration\n",
 59 |     "\n",
 60 |     "[Docs: Rotational Quantization (RQ)](https://weaviate.io/developers/weaviate/configuration/compression/rq-compression)\n",
 61 |     "\n",
 62 |     "> Note: Rotational Quantization (RQ) does not require a training phase.<br/>\n",
 63 |     "> RQ begins compressing vectors immediately upon insertion, without waiting for a minimum number of objects or a training step.\n",
 64 |     ">\n",
 65 |     "> This makes RQ ideal for applications that need immediate compression and fast setup, as vectors are compressed and searchable as soon as they are added to the collection.<br/>\n"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "from weaviate.classes.config import Configure\n",
 75 |     "\n",
 76 |     "client.collections.delete(\"WikiQ\")\n",
 77 |     "\n",
 78 |     "client.collections.create(\n",
 79 |     "    name=\"WikiQ\",\n",
 80 |     "\n",
 81 |     "    vector_config=[\n",
 82 |     "        Configure.Vectors.text2vec_weaviate(\n",
 83 |     "            name=\"main_vector\",\n",
 84 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 85 |     "            source_properties=['title', 'text'],\n",
 86 |     "\n",
 87 |     "            # Configure RQ\n",
 88 |     "            vector_index_config=Configure.VectorIndex.hnsw(\n",
 89 |     "                quantizer=Configure.VectorIndex.Quantizer.rq(\n",
 90 |     "                    rescore_limit=200,  # Number of overfetched candidates used for rescoring\n",
 91 |     "                    bits=8              # Number of bits (only 8 is supported)\n",
 92 |     "                )\n",
 93 |     "            ),\n",
 94 |     "        )\n",
 95 |     "    ],\n",
 96 |     ")"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "## The rest is the same"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "from data_loader import import_wiki_data\n",
113 |     "import_wiki_data(client, \"WikiQ\", 25000)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "WikiQ = client.collections.get(\"WikiQ\")\n",
123 |     "WikiQ.aggregate.over_all()"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "## Clean up"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "# client.collections.delete(\"WikiQ\")"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "## Close the client"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "client.close()"
156 |    ]
157 |   }
158 |  ],
159 |  "metadata": {
160 |   "kernelspec": {
161 |    "display_name": ".venv (3.11.9)",
162 |    "language": "python",
163 |    "name": "python3"
164 |   },
165 |   "language_info": {
166 |    "codemirror_mode": {
167 |     "name": "ipython",
168 |     "version": 3
169 |    },
170 |    "file_extension": ".py",
171 |    "mimetype": "text/x-python",
172 |    "name": "python",
173 |    "nbconvert_exporter": "python",
174 |    "pygments_lexer": "ipython3",
175 |    "version": "3.11.9"
176 |   }
177 |  },
178 |  "nbformat": 4,
179 |  "nbformat_minor": 2
180 | }
181 | 


--------------------------------------------------------------------------------
/5-vector-compression/1-sq-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# SQ Compression – Load Data and compress vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_custom(\n",
 47 |     "    http_host=WEAVIATE_URL,\n",
 48 |     "    http_port=8080,\n",
 49 |     "    http_secure=False,\n",
 50 |     "    grpc_host=WEAVIATE_KEY,\n",
 51 |     "    grpc_port=50051,\n",
 52 |     "    grpc_secure=False,\n",
 53 |     ")\n",
 54 |     "\n",
 55 |     "client.is_ready()"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Create Collection with SQ configuration\n",
 63 |     "\n",
 64 |     "[Docs: Scalar Quantization (SQ)](https://weaviate.io/developers/weaviate/configuration/compression/sq-compression)\n",
 65 |     "\n",
 66 |     "> Note: Scalar Quantization includes a training phase, which is required to determine scalar bucket boundaries.<br/>\n",
 67 |     "> In other words, based on your data, it figures out how to best compress your vectors.\n",
 68 |     ">\n",
 69 |     "> The compression training starts when the collection reaches `training_limit` number of objects.<br/>\n",
 70 |     "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "from weaviate.classes.config import Configure\n",
 80 |     "\n",
 81 |     "client.collections.delete(\"WikiQ\")\n",
 82 |     "\n",
 83 |     "# Create a collection here - with Weaviate Embeddings (text2vec_weaviate) as the vectorizer\n",
 84 |     "client.collections.create(\n",
 85 |     "    name=\"WikiQ\",\n",
 86 |     "\n",
 87 |     "    vector_config=[\n",
 88 |     "        Configure.Vectors.text2vec_weaviate(\n",
 89 |     "            name=\"main_vector\",\n",
 90 |     "\n",
 91 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 92 |     "            source_properties=['title', 'text'],\n",
 93 |     "\n",
 94 |     "            # Configure SQ\n",
 95 |     "            vector_index_config=Configure.VectorIndex.hnsw(\n",
 96 |     "                quantizer=Configure.VectorIndex.Quantizer.sq(\n",
 97 |     "                    rescore_limit=200,    # the number of overfetched candidates used for rescoring\n",
 98 |     "                    training_limit=10_000  # (default 100k) number of objects needed to train the scalar quantizer\n",
 99 |     "                )\n",
100 |     "            ),\n",
101 |     "        )\n",
102 |     "    ],\n",
103 |     ")"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "## The rest is the same"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "from data_loader import import_wiki_data\n",
120 |     "import_wiki_data(client, \"WikiQ\", 25000)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "WikiQ = client.collections.use(\"WikiQ\")\n",
130 |     "WikiQ.aggregate.over_all()"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "## Clean up"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "# client.collections.delete(\"WikiQ\")"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "## Close the client"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "client.close()"
163 |    ]
164 |   }
165 |  ],
166 |  "metadata": {
167 |   "kernelspec": {
168 |    "display_name": ".venv (3.11.9)",
169 |    "language": "python",
170 |    "name": "python3"
171 |   },
172 |   "language_info": {
173 |    "codemirror_mode": {
174 |     "name": "ipython",
175 |     "version": 3
176 |    },
177 |    "file_extension": ".py",
178 |    "mimetype": "text/x-python",
179 |    "name": "python",
180 |    "nbconvert_exporter": "python",
181 |    "pygments_lexer": "ipython3",
182 |    "version": "3.11.9"
183 |   }
184 |  },
185 |  "nbformat": 4,
186 |  "nbformat_minor": 2
187 | }
188 | 


--------------------------------------------------------------------------------
/5-vector-compression/1-bq-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# BQ Compression – Load Data and compress vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     ")\n",
 50 |     "\n",
 51 |     "client.is_ready()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "## Create Collection with BQ configuration\n",
 59 |     "\n",
 60 |     "[Docs: Binary Quantization (BQ)](https://weaviate.io/developers/weaviate/configuration/compression/bq-compression)\n",
 61 |     "\n",
 62 |     "Note #1: Binary Quantization works from the first object added to the collection. No training required.\n",
 63 |     "\n",
 64 |     "Note #2: Binary Quantization works both with HNSW and Flat index."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "from weaviate.classes.config import Configure, VectorDistances\n",
 74 |     "\n",
 75 |     "client.collections.delete(\"WikiQ\")\n",
 76 |     "\n",
 77 |     "# Create a collection here - with Weaviate Embeddings (text2vec_weaviate) as the vectorizer\n",
 78 |     "client.collections.create(\n",
 79 |     "    name=\"WikiQ\",\n",
 80 |     "\n",
 81 |     "    vector_config=[\n",
 82 |     "        Configure.Vectors.text2vec_weaviate(\n",
 83 |     "            name=\"main_vector\",\n",
 84 |     "\n",
 85 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 86 |     "            source_properties=['title', 'text'],\n",
 87 |     "\n",
 88 |     "            # Configure BQ with flat vector index\n",
 89 |     "            vector_index_config=Configure.VectorIndex.flat(\n",
 90 |     "                distance_metric=VectorDistances.COSINE,\n",
 91 |     "                vector_cache_max_objects=100_000,\n",
 92 |     "                quantizer=Configure.VectorIndex.Quantizer.bq(\n",
 93 |     "                    rescore_limit=200,\n",
 94 |     "                    cache=True\n",
 95 |     "                )\n",
 96 |     "            ),\n",
 97 |     "\n",
 98 |     "            # HNSW example\n",
 99 |     "            # vector_index_config=Configure.VectorIndex.hnsw(\n",
100 |     "            #     quantizer=Configure.VectorIndex.Quantizer.bq(\n",
101 |     "            #         rescore_limit=200,\n",
102 |     "            #         cache=True\n",
103 |     "            #     )\n",
104 |     "            # ),\n",
105 |     "        )\n",
106 |     "    ],\n",
107 |     ")"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "## The rest is the same"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "from data_loader import import_wiki_data\n",
124 |     "import_wiki_data(client, \"WikiQ\", 25000)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "wikiQ = client.collections.use(\"WikiQ\")\n",
134 |     "wikiQ.aggregate.over_all()"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "## Clean up"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# client.collections.delete(\"WikiQ\")"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "markdown",
155 |    "metadata": {},
156 |    "source": [
157 |     "## Close the client"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "client.close()"
167 |    ]
168 |   }
169 |  ],
170 |  "metadata": {
171 |   "kernelspec": {
172 |    "display_name": ".venv (3.11.9)",
173 |    "language": "python",
174 |    "name": "python3"
175 |   },
176 |   "language_info": {
177 |    "codemirror_mode": {
178 |     "name": "ipython",
179 |     "version": 3
180 |    },
181 |    "file_extension": ".py",
182 |    "mimetype": "text/x-python",
183 |    "name": "python",
184 |    "nbconvert_exporter": "python",
185 |    "pygments_lexer": "ipython3",
186 |    "version": "3.11.9"
187 |   }
188 |  },
189 |  "nbformat": 4,
190 |  "nbformat_minor": 2
191 | }
192 | 


--------------------------------------------------------------------------------
/5-vector-compression/1-pq-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Compression – Load Data and compress vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 46 |     "\n",
 47 |     "# Connect like the other workshop notebooks: WEAVIATE_URL is the cluster URL\n",
 48 |     "# and WEAVIATE_KEY is its API key (it must not be passed as a gRPC host).\n",
 49 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 50 |     "    cluster_url=WEAVIATE_URL,\n",
 51 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 52 |     ")\n",
 53 |     "\n",
 54 |     "# Sanity check - should evaluate to True before continuing\n",
 55 |     "client.is_ready()"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Create Collection with PQ configuration\n",
 63 |     "\n",
 64 |     "[Docs: Product Quantization (PQ)](https://weaviate.io/developers/weaviate/configuration/compression/pq-compression)\n",
 65 |     "\n",
 66 |     "> Note: Product Quantization includes a training phase, which is required to create codebooks (codebooks are used to generate centroids for compressed vectors).<br/>\n",
 67 |     "> In other words, based on your data, it figures out how to best compress your vectors.\n",
 68 |     ">\n",
 69 |     "> The compression training starts when the collection reaches `training_limit` number of objects.<br/>\n",
 70 |     "> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "from weaviate.classes.config import Configure\n",
 80 |     "\n",
 81 |     "client.collections.delete(\"WikiQ\")\n",
 82 |     "\n",
 83 |     "# Create a collection here - with Weaviate as a vectorizer\n",
 84 |     "client.collections.create(\n",
 85 |     "    name=\"WikiQ\",\n",
 86 |     "\n",
 87 |     "    vector_config=[\n",
 88 |     "        Configure.Vectors.text2vec_weaviate(\n",
 89 |     "            name=\"main_vector\",\n",
 90 |     "\n",
 91 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 92 |     "            source_properties=['title', 'text'],\n",
 93 |     "\n",
 94 |     "            # Configure PQ\n",
 95 |     "            vector_index_config=Configure.VectorIndex.hnsw(\n",
 96 |     "                quantizer=Configure.VectorIndex.Quantizer.pq(\n",
 97 |     "                    segments=256, # 1024 dimensions / 4 - number of segments each vector is split into\n",
 98 |     "                    training_limit=10_000  # (default 100k) number of objects needed to train the codebook\n",
 99 |     "                )\n",
100 |     "            ),\n",
101 |     "        )\n",
102 |     "    ],\n",
103 |     ")"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "## The rest is the same"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "from data_loader import import_wiki_data\n",
120 |     "import_wiki_data(client, \"WikiQ\", 25000)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "WikiQ = client.collections.use(\"WikiQ\")\n",
130 |     "WikiQ.aggregate.over_all()"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "## Clean up"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "# client.collections.delete(\"WikiQ\")"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "## Close the client"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "client.close()"
163 |    ]
164 |   }
165 |  ],
166 |  "metadata": {
167 |   "kernelspec": {
168 |    "display_name": ".venv (3.11.9)",
169 |    "language": "python",
170 |    "name": "python3"
171 |   },
172 |   "language_info": {
173 |    "codemirror_mode": {
174 |     "name": "ipython",
175 |     "version": 3
176 |    },
177 |    "file_extension": ".py",
178 |    "mimetype": "text/x-python",
179 |    "name": "python",
180 |    "nbconvert_exporter": "python",
181 |    "pygments_lexer": "ipython3",
182 |    "version": "3.11.9"
183 |   }
184 |  },
185 |  "nbformat": 4,
186 |  "nbformat_minor": 2
187 | }
188 | 


--------------------------------------------------------------------------------
/4-multi-tenancy/4-search-tenants.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Multi-tenant Chat with Papers - Query papers\n",
  8 |     "## Get keys and urls"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import os\n",
 18 |     "from dotenv import load_dotenv\n",
 19 |     "\n",
 20 |     "load_dotenv()\n",
 21 |     "\n",
 22 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 23 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 24 |     "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])\n",
 28 |     "print(OPENAI_API_KEY[:10])"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Connect to Weaviate"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import weaviate\n",
 45 |     "from weaviate.classes.init import Auth\n",
 46 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 47 |     "\n",
 48 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 49 |     "    cluster_url=WEAVIATE_URL,\n",
 50 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 51 |     "\n",
 52 |     "    headers = {\n",
 53 |     "        \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
 54 |     "    },\n",
 55 |     "\n",
 56 |     "    # additional_config=AdditionalConfig(\n",
 57 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 58 |     "    # )\n",
 59 |     ")\n",
 60 |     "\n",
 61 |     "client.is_ready()"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "## Vector search on tenants"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "papers = client.collections.use(\"Papers\")\n",
 78 |     "\n",
 79 |     "ten = papers.with_tenant(\"2212-10496\")\n",
 80 |     "\n",
 81 |     "response = ten.query.near_text(\n",
 82 |     "    query=\"Unsupervised learning\",\n",
 83 |     "    limit=5,\n",
 84 |     ")\n",
 85 |     "\n",
 86 |     "for item in response.objects:\n",
 87 |     "    print(item.properties[\"chunk\"])"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "## Generative Search with tenants"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "papers = client.collections.use(\"Papers\")\n",
104 |     "\n",
105 |     "ten2212 = papers.with_tenant(\"2212-10496\")\n",
106 |     "\n",
107 |     "response = ten2212.generate.near_text(\n",
108 |     "    query=\"Unsupervised learning\",\n",
109 |     "    limit=5,\n",
110 |     "    single_prompt=\"What does the following text describe: {chunk}\",\n",
111 |     ")\n",
112 |     "\n",
113 |     "for item in response.objects:\n",
114 |     "    print(item.properties[\"chunk\"])\n",
115 |     "    print(item.generative.text, '\\n')"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "papers = client.collections.use(\"Papers\")\n",
125 |     "\n",
126 |     "ten2212 = papers.with_tenant(\"2212-10496\")\n",
127 |     "\n",
128 |     "response = ten2212.generate.near_text(\n",
129 |     "    query=\"Unsupervised learning\",\n",
130 |     "    limit=5,\n",
131 |     "    grouped_task=\"Explain how unsupervised learning works. Use only the provided content.\",\n",
132 |     "    grouped_properties=[\"chunk\"]\n",
133 |     ")\n",
134 |     "\n",
135 |     "for item in response.objects:\n",
136 |     "    print(item.properties[\"chunk\"])\n",
137 |     "\n",
138 |     "print(response.generative.text)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "def paper_rag(paper_id, query, prompt):\n",
148 |     "    papers = client.collections.use(\"Papers\")\n",
149 |     "    ten = papers.with_tenant(paper_id)\n",
150 |     "\n",
151 |     "    response = ten.generate.near_text(\n",
152 |     "        query=query,\n",
153 |     "        limit=5,\n",
154 |     "        grouped_task=prompt + \" Use only the provided content.\",\n",
155 |     "        grouped_properties=[\"chunk\"],\n",
156 |     "    )\n",
157 |     "\n",
158 |     "    return {\n",
159 |     "        \"title\": response.objects[0].properties[\"title\"],\n",
160 |     "        \"source\": [p.properties[\"chunk\"] for p in response.objects],\n",
161 |     "        \"generated\": response.generative.text\n",
162 |     "    }"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "paper_rag(\n",
172 |     "    \"2212-10496\",\n",
173 |     "    \"Unsupervised learning\",\n",
174 |     "    \"Explain how unsupervised learning works\"\n",
175 |     ")"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "papers = client.collections.use(\"Papers\")\n",
185 |     "papers.tenants.get()"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "## Close the client"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "client.close()"
202 |    ]
203 |   }
204 |  ],
205 |  "metadata": {
206 |   "kernelspec": {
207 |    "display_name": ".venv (3.11.9)",
208 |    "language": "python",
209 |    "name": "python3"
210 |   },
211 |   "language_info": {
212 |    "codemirror_mode": {
213 |     "name": "ipython",
214 |     "version": 3
215 |    },
216 |    "file_extension": ".py",
217 |    "mimetype": "text/x-python",
218 |    "name": "python",
219 |    "nbconvert_exporter": "python",
220 |    "pygments_lexer": "ipython3",
221 |    "version": "3.11.9"
222 |   }
223 |  },
224 |  "nbformat": 4,
225 |  "nbformat_minor": 2
226 | }
227 | 


--------------------------------------------------------------------------------
/3-rag/2-rag-gen-query-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# !pip install openai"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
 26 |     "\n",
 27 |     "print(WEAVIATE_URL[:10])\n",
 28 |     "print(WEAVIATE_KEY[:10])\n",
 29 |     "print(OPENAI_API_KEY[:10])"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "## Generate query from prompt"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "from openai import OpenAI\n",
 46 |     "\n",
 47 |     "openai_client = OpenAI(\n",
 48 |     "    api_key=OPENAI_API_KEY,\n",
 49 |     "    base_url=\"https://api.openai.com/v1\",\n",
 50 |     ")"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "def generate_query_from_promt(prompt):\n",
 60 |     "    response = openai_client.chat.completions.create(\n",
 61 |     "        model=\"gpt-3.5-turbo\",\n",
 62 |     "        messages=[\n",
 63 |     "            { \"role\": \"system\", \"content\": \"Your job is to extract a query from the provided user prompt, the query will then be used to run a query in a vector database.\" },\n",
 64 |     "            { \n",
 65 |     "                \"role\": \"user\",\n",
 66 |     "                \"content\": f\"Please give me a 2-3 word query that can be used to find relevant info to the following prompt - {prompt}\"\n",
 67 |     "            },\n",
 68 |     "        ]\n",
 69 |     "    )\n",
 70 |     "    return response.choices[0].message.content"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "# Example of how to generate a query from a prompt\n",
 80 |     "generate_query_from_promt(\"Where do the tallest penguins live?\")"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## Connect to Weaviate"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "import weaviate\n",
 97 |     "from weaviate.classes.init import Auth\n",
 98 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 99 |     "\n",
100 |     "client = weaviate.connect_to_weaviate_cloud(\n",
101 |     "    cluster_url=WEAVIATE_URL,\n",
102 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
103 |     "\n",
104 |     "    headers = {\n",
105 |     "        \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
106 |     "    },\n",
107 |     "\n",
108 |     "    # additional_config=AdditionalConfig(\n",
109 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
110 |     "    # )\n",
111 |     ")\n",
112 |     "\n",
113 |     "client.is_ready()"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "## Two-step RAG"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "def two_step_rag(user_prompt):\n",
130 |     "    # Step 1\n",
131 |     "    prompt = user_prompt + \" Please only use the provided content with this prompt. Don't make things up.\"\n",
132 |     "    \n",
133 |     "    generated_query = generate_query_from_promt(prompt)\n",
134 |     "    print(\"=== Generated Query ===\")\n",
135 |     "    print(f\"Generated query: {generated_query}\")\n",
136 |     "\n",
137 |     "    # Step 2\n",
138 |     "    wiki = client.collections.use(\"Wiki\")\n",
139 |     "\n",
140 |     "    response = wiki.generate.near_text(\n",
141 |     "        query=generated_query,\n",
142 |     "        limit=3,\n",
143 |     "        grouped_task=prompt,\n",
144 |     "        grouped_properties=[\"text\", \"title\"]\n",
145 |     "    )\n",
146 |     "\n",
147 |     "    # Print results\n",
148 |     "    print(\"\\n=== Generated Response ===\")\n",
149 |     "    print(response.generative.text)\n",
150 |     "\n",
151 |     "    print(\"\\n=== Source ===\")\n",
152 |     "    for item in response.objects:\n",
153 |     "        print(item.properties)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# two_step_rag(\"What wild animals do we know about?\")\n",
163 |     "two_step_rag(\"Please provide an explanation at a highschool level. How do airplanes fly?\")"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "two_step_rag(\"What are the pros and cons of automation using computer?\")"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "two_step_rag(\"How do CPUs work?\")"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "## Close the client"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "client.close()"
198 |    ]
199 |   }
200 |  ],
201 |  "metadata": {
202 |   "kernelspec": {
203 |    "display_name": ".venv (3.11.9)",
204 |    "language": "python",
205 |    "name": "python3"
206 |   },
207 |   "language_info": {
208 |    "codemirror_mode": {
209 |     "name": "ipython",
210 |     "version": 3
211 |    },
212 |    "file_extension": ".py",
213 |    "mimetype": "text/x-python",
214 |    "name": "python",
215 |    "nbconvert_exporter": "python",
216 |    "pygments_lexer": "ipython3",
217 |    "version": "3.11.9"
218 |   }
219 |  },
220 |  "nbformat": 4,
221 |  "nbformat_minor": 2
222 | }
223 | 


--------------------------------------------------------------------------------
/4-multi-tenancy/3b-load-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Multi-tenant Chat with Papers - Load and chunk papers\n",
  8 |     "## Get keys and urls"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import os\n",
 18 |     "from dotenv import load_dotenv\n",
 19 |     "\n",
 20 |     "load_dotenv()\n",
 21 |     "\n",
 22 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 23 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 24 |     "\n",
 25 |     "print(WEAVIATE_URL[:10])\n",
 26 |     "print(WEAVIATE_KEY[:10])"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "## Connect to Weaviate"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import weaviate\n",
 43 |     "from weaviate.classes.init import Auth\n",
 44 |     "\n",
 45 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 46 |     "    cluster_url=WEAVIATE_URL,\n",
 47 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 48 |     ")\n",
 49 |     "\n",
 50 |     "client.is_ready()"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "## Load Data from arxiv\n",
 58 |     "\n",
 59 |     "1. Get chunks from paper - `get_chunks_from_paper`\n",
 60 |     "2. Create a tenant for the paper - `create_tenant`\n",
 61 |     "3. Batch import chunks - `batch_import_chunks`"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "### 1. Get chunks from paper - `get_chunks_from_paper`"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "from distyll.text import from_arxiv_paper\n",
 78 |     "from distyll.utils import chunk_text\n",
 79 |     "\n",
 80 |     "def get_chunks_from_paper(url):\n",
 81 |     "    paper = from_arxiv_paper(url)\n",
 82 |     "    chunks = chunk_text(source_text=paper[\"text\"])\n",
 83 |     "\n",
 84 |     "    paper[\"arxiv_id\"] = url.replace(\"https://arxiv.org/pdf/\", \"\").replace(\".pdf\", \"\").replace(\".\", \"-\")\n",
 85 |     "    paper[\"chunks\"] = chunks\n",
 86 |     "    return paper"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "chunked_2212 = get_chunks_from_paper(\"https://arxiv.org/pdf/2212.10496.pdf\")\n",
 96 |     "chunked_2212"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "### 2. Create a tenant for the paper - `create_tenant`"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "from weaviate.classes.tenants import Tenant\n",
113 |     "papers = client.collections.use(\"Papers\")\n",
114 |     "\n",
115 |     "def create_tenant(chunked_paper):\n",
116 |     "    tenant_name = chunked_paper[\"arxiv_id\"]\n",
117 |     "\n",
118 |     "    papers.tenants.create([\n",
119 |     "        Tenant(name=tenant_name)\n",
120 |     "    ])\n",
121 |     "\n",
122 |     "    return tenant_name"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "create_tenant(chunked_2212)"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "papers.tenants.get()"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "### 3. Batch import chunks - `batch_import_chunks`"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "def batch_import_chunks(chunked_paper):\n",
157 |     "    # Import every chunk of the paper into the tenant named after its arxiv_id\n",
158 |     "    ten = papers.with_tenant(chunked_paper[\"arxiv_id\"])\n",
159 |     "\n",
160 |     "    with ten.batch.dynamic() as batch:\n",
161 |     "        for i, chunk in enumerate(chunked_paper[\"chunks\"]):\n",
162 |     "            batch.add_object({\n",
163 |     "                \"title\": chunked_paper[\"title\"],\n",
164 |     "                \"url\": chunked_paper[\"url\"],\n",
165 |     "                \"chunk\": chunk,\n",
166 |     "                \"chunk_no\": i,\n",
167 |     "            })\n",
168 |     "\n",
169 |     "    # Check and report failures from the same tenant-scoped handle (`ten`) that ran the batch\n",
170 |     "    failed = ten.batch.failed_objects\n",
171 |     "    if failed:\n",
172 |     "        print(\"Import complete with errors\")\n",
173 |     "        for err in failed:\n",
174 |     "            print(err)\n",
175 |     "    else:\n",
176 |     "        print(\"Import complete with no errors\")"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "batch_import_chunks(chunked_2212)"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "## End-to-end paper load"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "def import_paper(url):\n",
202 |     "    cp = get_chunks_from_paper(url)\n",
203 |     "    tenant_name = create_tenant(cp)\n",
204 |     "    batch_import_chunks(cp)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "import_paper(\"https://arxiv.org/pdf/2401.00107.pdf\")"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "## Close the client"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "client.close()"
230 |    ]
231 |   }
232 |  ],
233 |  "metadata": {
234 |   "kernelspec": {
235 |    "display_name": ".venv (3.11.9)",
236 |    "language": "python",
237 |    "name": "python3"
238 |   },
239 |   "language_info": {
240 |    "codemirror_mode": {
241 |     "name": "ipython",
242 |     "version": 3
243 |    },
244 |    "file_extension": ".py",
245 |    "mimetype": "text/x-python",
246 |    "name": "python",
247 |    "nbconvert_exporter": "python",
248 |    "pygments_lexer": "ipython3",
249 |    "version": "3.11.9"
250 |   }
251 |  },
252 |  "nbformat": 4,
253 |  "nbformat_minor": 2
254 | }
255 | 


--------------------------------------------------------------------------------
/5-vector-compression/0-vector-indexes.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Vector Indexes\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     ")\n",
 50 |     "\n",
 51 |     "client.is_ready()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "## Collection with HNSW index (default)\n",
 59 |     "\n",
 60 |     "[HNSW params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#hnsw-index-parameters)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "from weaviate.classes.config import Configure, VectorDistances\n",
 70 |     "\n",
 71 |     "client.collections.delete(\"IndexExample\")\n",
 72 |     "\n",
 73 |     "# Create a collection here - with Weaviate as a vectorizer\n",
 74 |     "client.collections.create(\n",
 75 |     "    name=\"IndexExample\",\n",
 76 |     "\n",
 77 |     "    vector_config=[\n",
 78 |     "        Configure.Vectors.text2vec_weaviate(\n",
 79 |     "            name=\"main_vector\",\n",
 80 |     "\n",
 81 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 82 |     "            source_properties=['title', 'text'],\n",
 83 |     "\n",
 84 |     "            # HNSW example\n",
 85 |     "            vector_index_config=Configure.VectorIndex.hnsw()\n",
 86 |     "        )\n",
 87 |     "    ],\n",
 88 |     ")"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "## Collection with Flat index\n",
 96 |     "\n",
 97 |     "[Flat params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#flat-indexes)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "from weaviate.classes.config import Configure, VectorDistances\n",
107 |     "\n",
108 |     "client.collections.delete(\"IndexExample\")\n",
109 |     "\n",
110 |     "# Create a collection here - with Weaviate as a vectorizer\n",
111 |     "client.collections.create(\n",
112 |     "    name=\"IndexExample\",\n",
113 |     "\n",
114 |     "    vector_config=[\n",
115 |     "        Configure.Vectors.text2vec_weaviate(\n",
116 |     "            name=\"main_vector\",\n",
117 |     "\n",
118 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
119 |     "            source_properties=['title', 'text'],\n",
120 |     "\n",
121 |     "            # Flat example\n",
122 |     "            vector_index_config=Configure.VectorIndex.flat(\n",
123 |     "                # distance_metric=VectorDistances.COSINE, # optional\n",
124 |     "                vector_cache_max_objects=100_000,\n",
125 |     "            ),\n",
126 |     "        ),\n",
127 |     "    ],\n",
128 |     ")"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "## Collection with Dynamic index\n",
136 |     "\n",
137 |     "[Dynamic params](https://weaviate.io/developers/weaviate/config-refs/schema/vector-index#dynamic-index-parameters)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "from weaviate.classes.config import Configure\n",
147 |     "\n",
148 |     "client.collections.delete(\"IndexExample\")\n",
149 |     "\n",
150 |     "# Create a collection here - with Weaviate Embeddings as a vectorizer\n",
151 |     "client.collections.create(\n",
152 |     "    name=\"IndexExample\",\n",
153 |     "\n",
154 |     "    vector_config=[\n",
155 |     "        Configure.Vectors.text2vec_weaviate(\n",
156 |     "            name=\"main_vector\",\n",
157 |     "\n",
158 |     "            model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
159 |     "            source_properties=['title', 'text'],\n",
160 |     "\n",
161 |     "            # Dynamic example\n",
162 |     "            vector_index_config=Configure.VectorIndex.dynamic(\n",
163 |     "                threshold=10_000, # when to switch to HNSW\n",
164 |     "\n",
165 |     "                flat=Configure.VectorIndex.flat(\n",
166 |     "                    vector_cache_max_objects=100_000,\n",
167 |     "                    # note: can also include a quantizer\n",
168 |     "                    quantizer=Configure.VectorIndex.Quantizer.bq()\n",
169 |     "                ),\n",
170 |     "\n",
171 |     "                hnsw=Configure.VectorIndex.hnsw(\n",
172 |     "                    max_connections=32, # optional\n",
173 |     "                    # note: the quantizer can be different between flat and hnsw\n",
174 |     "                    quantizer=Configure.VectorIndex.Quantizer.pq()\n",
175 |     "                ),\n",
176 |     "            ),\n",
177 |     "        )\n",
178 |     "    ],\n",
179 |     ")"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "## The rest is the same"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "# uncomment the lines below if you want to import 25k objects into your collection to test it\n",
196 |     "# from data_loader import import_wiki_data\n",
197 |     "# import_wiki_data(client, \"IndexExample\", 25_000)"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {},
204 |    "outputs": [],
205 |    "source": [
206 |     "# index_example = client.collections.use(\"IndexExample\")\n",
207 |     "# index_example.aggregate.over_all()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "## Clean up"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "client.collections.delete(\"IndexExample\")"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "## Close the client"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "client.close()"
240 |    ]
241 |   }
242 |  ],
243 |  "metadata": {
244 |   "kernelspec": {
245 |    "display_name": ".venv (3.11.9)",
246 |    "language": "python",
247 |    "name": "python3"
248 |   },
249 |   "language_info": {
250 |    "codemirror_mode": {
251 |     "name": "ipython",
252 |     "version": 3
253 |    },
254 |    "file_extension": ".py",
255 |    "mimetype": "text/x-python",
256 |    "name": "python",
257 |    "nbconvert_exporter": "python",
258 |    "pygments_lexer": "ipython3",
259 |    "version": "3.11.9"
260 |   }
261 |  },
262 |  "nbformat": 4,
263 |  "nbformat_minor": 2
264 | }
265 | 


--------------------------------------------------------------------------------
/2-pre-vectorised-data/2-wiki-import.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Load Data with Vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     ")\n",
 50 |     "\n",
 51 |     "client.is_ready()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "weaviate.__version__"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "from weaviate.classes.config import Configure\n",
 70 |     "\n",
 71 |     "def create_wiki_collection():\n",
 72 |     "    if client.collections.exists(\"Wiki\"):\n",
 73 |     "        client.collections.delete(\"Wiki\")\n",
 74 |     "\n",
 75 |     "    # Create a collection here - with Weaviate vectorizer and define source properties\n",
 76 |     "    client.collections.create(\n",
 77 |     "        name=\"Wiki\",\n",
 78 |     "\n",
 79 |     "        vector_config=[\n",
 80 |     "            Configure.Vectors.text2vec_weaviate(\n",
 81 |     "                name=\"main_vector\",\n",
 82 |     "\n",
 83 |     "                # TODO: use model Snowflake/snowflake-arctic-embed-l-v2.0\n",
 84 |     "                # TODO: set source properties to title and text\n",
 85 |     "                \n",
 86 |     "            )\n",
 87 |     "        ],\n",
 88 |     "    )\n",
 89 |     "\n",
 90 |     "create_wiki_collection()"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "metadata": {},
 96 |    "source": [
 97 |     "## Load the data from parquet files"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "from datasets import load_dataset\n",
107 |     "\n",
108 |     "def prepare_dataset():\n",
109 |     "    return load_dataset('parquet', data_files={'train': ['../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n",
110 |     "    # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "### Dataset Test\n",
118 |     "<!-- The parquet files should be located in \"../wiki-data/weaviate/snowflake-arctic-v2\". -->"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "dataset = prepare_dataset()\n",
128 |     "\n",
129 |     "counter = 10\n",
130 |     "for i in dataset:\n",
131 |     "    print(i)\n",
132 |     "\n",
133 |     "    counter -= 1\n",
134 |     "    if(counter == 0): break"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "### The import function\n",
142 |     "\n",
143 |     "`TODO:`\n",
144 |     "* add a call to `batch.add_object` that adds each object to the batch"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "from tqdm import tqdm\n",
154 |     "from weaviate.util import generate_uuid5\n",
155 |     "\n",
156 |     "def import_wiki_data(max_rows=10_000):\n",
157 |     "    print(f\"Importing {max_rows} data items\")\n",
158 |     "\n",
159 |     "    dataset = prepare_dataset()\n",
160 |     "    wiki = client.collections.use(\"Wiki\")\n",
161 |     "\n",
162 |     "    counter = 0\n",
163 |     "\n",
164 |     "    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
165 |     "        for item in tqdm(dataset, total=max_rows):\n",
166 |     "\n",
167 |     "            data_to_insert = {   \n",
168 |     "                \"wiki_id\": item[\"wiki_id\"],\n",
169 |     "                \"text\": item[\"text\"],\n",
170 |     "                \"title\": item[\"title\"],\n",
171 |     "                \"url\": item[\"url\"],\n",
172 |     "            }\n",
173 |     "\n",
174 |     "            item_id = generate_uuid5(item[\"wiki_id\"])\n",
175 |     "\n",
176 |     "            item_vector = {\n",
177 |     "                \"main_vector\": item[\"vector\"]\n",
178 |     "            }\n",
179 |     "\n",
180 |     "            # TODO: add objects to batch using\n",
181 |     "            batch.add_object(\n",
182 |     "                # * data_to_insert\n",
183 |     "                # * item_id\n",
184 |     "                # * item_vector\n",
185 |     "            )\n",
186 |     "\n",
187 |     "            # Check number of errors while running\n",
188 |     "            if(batch.number_errors > 10):\n",
189 |     "                print(f\"Reached {batch.number_errors} errors during batch import\")\n",
190 |     "                break\n",
191 |     "            \n",
192 |     "            # stop once the number of processed items reaches max_rows\n",
193 |     "            counter += 1\n",
194 |     "            if counter >= max_rows:\n",
195 |     "                break\n",
196 |     "    \n",
197 |     "    # check for errors at the end\n",
198 |     "    if (len(wiki.batch.failed_objects)>0):\n",
199 |     "        print(\"Final error check\")\n",
200 |     "        print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n",
201 |     "        print(wiki.batch.failed_objects[-1])\n",
202 |     "    \n",
203 |     "    print(f\"Imported {counter} items\")\n",
204 |     "    print(\"-----------------------------------\")"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "import_wiki_data(10_000)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "## Check if data loaded correctly"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "wiki = client.collections.use(\"Wiki\")\n",
230 |     "len(wiki)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n",
240 |     "print(res.objects[0].properties)\n",
241 |     "print(res.objects[0].vector)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "client.close()"
251 |    ]
252 |   }
253 |  ],
254 |  "metadata": {
255 |   "kernelspec": {
256 |    "display_name": ".venv (3.11.9)",
257 |    "language": "python",
258 |    "name": "python3"
259 |   },
260 |   "language_info": {
261 |    "codemirror_mode": {
262 |     "name": "ipython",
263 |     "version": 3
264 |    },
265 |    "file_extension": ".py",
266 |    "mimetype": "text/x-python",
267 |    "name": "python",
268 |    "nbconvert_exporter": "python",
269 |    "pygments_lexer": "ipython3",
270 |    "version": "3.11.9"
271 |   }
272 |  },
273 |  "nbformat": 4,
274 |  "nbformat_minor": 2
275 | }
276 | 


--------------------------------------------------------------------------------
/2-pre-vectorised-data/complete/2-wiki-import-complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Load Data with Vectors\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "\n",
 46 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 47 |     "    cluster_url=WEAVIATE_URL,\n",
 48 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 49 |     ")\n",
 50 |     "\n",
 51 |     "client.is_ready()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "from weaviate.classes.config import Configure\n",
 61 |     "\n",
 62 |     "def create_wiki_collection():\n",
 63 |     "    if client.collections.exists(\"Wiki\"):\n",
 64 |     "        client.collections.delete(\"Wiki\")\n",
 65 |     "\n",
 66 |     "    # Create a collection here - with Weaviate vectorizer and define source properties\n",
 67 |     "    client.collections.create(\n",
 68 |     "        name=\"Wiki\",\n",
 69 |     "\n",
 70 |     "        vector_config=[\n",
 71 |     "            Configure.Vectors.text2vec_weaviate(\n",
 72 |     "                name=\"main_vector\",\n",
 73 |     "\n",
 74 |     "                model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
 75 |     "                source_properties=['title', 'text'] # which properties should be used to generate a vector\n",
 76 |     "            )\n",
 77 |     "        ],\n",
 78 |     "    )\n",
 79 |     "\n",
 80 |     "create_wiki_collection()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "## Load the data from parquet files"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "from datasets import load_dataset\n",
 97 |     "\n",
 98 |     "def prepare_dataset():\n",
 99 |     "    return load_dataset('parquet', data_files={'train': ['../../wiki-data/weaviate/snowflake-arctic-v2/*.parquet']}, split=\"train\", streaming=True)\n",
100 |     "    # return load_dataset(\"weaviate/wiki-sample\", \"weaviate-snowflake-arctic-v2\", split=\"train\", streaming=True)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "### Dataset Test\n",
108 |     "<!-- The parquet files should be located in \"../../wiki-data/weaviate/snowflake-arctic-v2\". -->"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "dataset = prepare_dataset()\n",
118 |     "\n",
119 |     "counter = 10\n",
120 |     "for i in dataset:\n",
121 |     "    print(i)\n",
122 |     "\n",
123 |     "    counter -= 1\n",
124 |     "    if(counter == 0): break"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "### The import function\n",
132 |     "\n",
133 |     "`TODO:`\n",
134 |     "* add a call to `batch.add_object` that adds each object to the batch"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "from tqdm import tqdm\n",
144 |     "from weaviate.util import generate_uuid5\n",
145 |     "\n",
146 |     "def import_wiki_data(max_rows=10_000):\n",
147 |     "    print(f\"Importing {max_rows} data items\")\n",
148 |     "\n",
149 |     "    dataset = prepare_dataset()\n",
150 |     "    wiki = client.collections.use(\"Wiki\")\n",
151 |     "\n",
152 |     "    counter = 0\n",
153 |     "\n",
154 |     "    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
155 |     "        for item in tqdm(dataset, total=max_rows):\n",
156 |     "\n",
157 |     "            data_to_insert = {   \n",
158 |     "                \"wiki_id\": item[\"wiki_id\"],\n",
159 |     "                \"text\": item[\"text\"],\n",
160 |     "                \"title\": item[\"title\"],\n",
161 |     "                \"url\": item[\"url\"],\n",
162 |     "            }\n",
163 |     "\n",
164 |     "            item_id = generate_uuid5(item[\"wiki_id\"])\n",
165 |     "\n",
166 |     "            # vector = item[\"vector\"]\n",
167 |     "            item_vector = {\n",
168 |     "                \"main_vector\": item[\"vector\"]\n",
169 |     "            }\n",
170 |     "\n",
171 |     "            batch.add_object(\n",
172 |     "                properties=data_to_insert,\n",
173 |     "                \n",
174 |     "                uuid=item_id,\n",
175 |     "                vector=item_vector\n",
176 |     "            )\n",
177 |     "\n",
178 |     "            # Check number of errors while running\n",
179 |     "            if(batch.number_errors > 10):\n",
180 |     "                print(f\"Reached {batch.number_errors} Errors during batch import\")\n",
181 |     "                break\n",
182 |     "            \n",
183 |     "            # stop once the number of processed items reaches max_rows\n",
184 |     "            counter += 1\n",
185 |     "            if(counter >= max_rows):\n",
186 |     "                break\n",
187 |     "    \n",
188 |     "    # check for errors at the end\n",
189 |     "    if (len(wiki.batch.failed_objects)>0):\n",
190 |     "        print(\"Final error check\")\n",
191 |     "        print(f\"Some errors {len(wiki.batch.failed_objects)}\")\n",
192 |     "        print(wiki.batch.failed_objects[-1])\n",
193 |     "    \n",
194 |     "    print(f\"Imported {counter} items\")\n",
195 |     "    print(\"-----------------------------------\")"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "import_wiki_data(10_000)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "## Check if data loaded correctly"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "wiki = client.collections.use(\"Wiki\")\n",
221 |     "len(wiki)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "res = wiki.query.fetch_objects(limit=1, include_vector=True)\n",
231 |     "print(res.objects[0].properties)\n",
232 |     "print(res.objects[0].vector)"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "metadata": {},
238 |    "source": [
239 |     "## Close the client"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "client.close()"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": []
257 |   }
258 |  ],
259 |  "metadata": {
260 |   "kernelspec": {
261 |    "display_name": ".venv (3.11.9)",
262 |    "language": "python",
263 |    "name": "python3"
264 |   },
265 |   "language_info": {
266 |    "codemirror_mode": {
267 |     "name": "ipython",
268 |     "version": 3
269 |    },
270 |    "file_extension": ".py",
271 |    "mimetype": "text/x-python",
272 |    "name": "python",
273 |    "nbconvert_exporter": "python",
274 |    "pygments_lexer": "ipython3",
275 |    "version": "3.11.9"
276 |   }
277 |  },
278 |  "nbformat": 4,
279 |  "nbformat_minor": 2
280 | }
281 | 


--------------------------------------------------------------------------------
/3-rag/complete/1-rag-complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# RAG - How to query"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import os\n",
 17 |     "from dotenv import load_dotenv\n",
 18 |     "\n",
 19 |     "load_dotenv()\n",
 20 |     "\n",
 21 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 22 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 23 |     "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
 24 |     "\n",
 25 |     "print(WEAVIATE_URL[:10])\n",
 26 |     "print(WEAVIATE_KEY[:10])\n",
 27 |     "print(OPENAI_API_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 46 |     "\n",
 47 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 48 |     "    cluster_url=WEAVIATE_URL,\n",
 49 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 50 |     "\n",
 51 |     "    headers = {\n",
 52 |     "        \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
 53 |     "    },\n",
 54 |     "\n",
 55 |     "    # additional_config=AdditionalConfig(\n",
 56 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 57 |     "    # )\n",
 58 |     ")\n",
 59 |     "\n",
 60 |     "client.is_ready()"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "### Start with (R) - Retrieval"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "wiki = client.collections.use(\"Wiki\")\n",
 77 |     "\n",
 78 |     "response = wiki.query.near_text(\n",
 79 |     "    query=\"How do planes fly\",\n",
 80 |     "    limit=5,\n",
 81 |     "    return_properties=[\"text\", \"title\"]\n",
 82 |     ")\n",
 83 |     "\n",
 84 |     "for item in response.objects:\n",
 85 |     "    print(item.properties)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Add (AG) - augmented generation - to make full RAG"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "#### Single Prompt\n",
100 |     "\n",
101 |     "> Generate a response per **retrieved** object."
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "# Let's add some colour to our lives :)\n",
111 |     "BLUE   = \"\\033[94m\"\n",
112 |     "PURPLE = \"\\033[95m\"\n",
113 |     "RESET  = \"\\033[0\""
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "from weaviate.classes.generate import GenerativeConfig\n",
123 |     "\n",
124 |     "wiki = client.collections.use(\"Wiki\")\n",
125 |     "\n",
126 |     "response = wiki.generate.near_text(\n",
127 |     "    query=\"How do planes fly\",\n",
128 |     "    # auto_limit=1,\n",
129 |     "    limit=5,\n",
130 |     "\n",
131 |     "    # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n",
132 |     "    generative_provider=GenerativeConfig.openai(\n",
133 |     "        model=\"gpt-4o-mini\",\n",
134 |     "    ),\n",
135 |     "    \n",
136 |     "    # TODO: add a single prompt \"Explain what this is about? {text}\"\n",
137 |     "    single_prompt=\"Explain what this is about? {text}\"\n",
138 |     ")\n",
139 |     "\n",
140 |     "for item in response.objects:\n",
141 |     "    print(f\"{BLUE}=== Source ===\")\n",
142 |     "    print(item.properties)\n",
143 |     "\n",
144 |     "    print(f\"{PURPLE}=== Generated Response ===\")\n",
145 |     "    print(item.generative.text)\n",
146 |     "    print(\"\\n\")"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "#### Grouped Task\n",
154 |     "\n",
155 |     "> Generate one response based on all **retrieved** objects."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "wiki = client.collections.use(\"Wiki\")\n",
165 |     "\n",
166 |     "response = wiki.generate.near_text(\n",
167 |     "    query=\"How do planes fly\",\n",
168 |     "    # auto_limit=1,\n",
169 |     "    limit=5,\n",
170 |     "\n",
171 |     "    generative_provider=GenerativeConfig.openai(\n",
172 |     "        model=\"gpt-4o-mini\",\n",
173 |     "    ),\n",
174 |     "\n",
175 |     "    grouped_task=\"Explain, how do planes fly? Please only use the provided content.\"\n",
176 |     ")\n",
177 |     "\n",
178 |     "print(f\"{PURPLE}=== Generated Response ===\")\n",
179 |     "print(response.generative.text)\n",
180 |     "\n",
181 |     "print(f\"{BLUE}=== Source ===\")\n",
182 |     "for item in response.objects:\n",
183 |     "    print(item.properties)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {},
189 |    "source": [
190 |     "#### Specify which properties to use for grouped task"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "wiki = client.collections.use(\"Wiki\")\n",
200 |     "\n",
201 |     "response = wiki.generate.near_text(\n",
202 |     "    query=\"How do planes fly\",\n",
203 |     "    auto_limit=1,\n",
204 |     "    grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
205 |     "    grouped_properties=[\"text\", \"title\"],\n",
206 |     "\n",
207 |     "    generative_provider=GenerativeConfig.openai(\n",
208 |     "        model=\"gpt-4o-mini\",\n",
209 |     "    ),\n",
210 |     ")\n",
211 |     "\n",
212 |     "print(f\"{PURPLE}=== Generated Response ===\")\n",
213 |     "print(response.generative.text)\n",
214 |     "\n",
215 |     "print(f\"{BLUE}=== Source ===\")\n",
216 |     "for item in response.objects:\n",
217 |     "    print(item.properties)"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "## Set default Generative model"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "from weaviate.classes.config import Reconfigure\n",
234 |     "\n",
235 |     "wiki = client.collections.use(\"Wiki\")\n",
236 |     "\n",
237 |     "wiki.config.update(\n",
238 |     "    generative_config=Reconfigure.Generative.openai(\n",
239 |     "        model=\"gpt-4o-mini\"  # Update the generative model\n",
240 |     "    )\n",
241 |     ")"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "markdown",
246 |    "metadata": {},
247 |    "source": [
248 |     "> Try generative query without providing the model"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "response = wiki.generate.near_text(\n",
258 |     "    query=\"What african animals do we have info on. Please only list those provided in here.\",\n",
259 |     "    auto_limit=1,\n",
260 |     "    \n",
261 |     "    grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
262 |     ")\n",
263 |     "\n",
264 |     "print(f\"{PURPLE}=== Generated Response ===\")\n",
265 |     "print(response.generative.text)\n",
266 |     "\n",
267 |     "print(f\"{BLUE}=== Source ===\")\n",
268 |     "for item in response.objects:\n",
269 |     "    print(item.properties)"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "markdown",
274 |    "metadata": {},
275 |    "source": [
276 |     "## Close the client"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": null,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "client.close()"
286 |    ]
287 |   }
288 |  ],
289 |  "metadata": {
290 |   "kernelspec": {
291 |    "display_name": ".venv (3.11.9)",
292 |    "language": "python",
293 |    "name": "python3"
294 |   },
295 |   "language_info": {
296 |    "codemirror_mode": {
297 |     "name": "ipython",
298 |     "version": 3
299 |    },
300 |    "file_extension": ".py",
301 |    "mimetype": "text/x-python",
302 |    "name": "python",
303 |    "nbconvert_exporter": "python",
304 |    "pygments_lexer": "ipython3",
305 |    "version": "3.11.9"
306 |   }
307 |  },
308 |  "nbformat": 4,
309 |  "nbformat_minor": 2
310 | }
311 | 


--------------------------------------------------------------------------------
/3-rag/1-rag.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# RAG - How to query"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import os\n",
 17 |     "from dotenv import load_dotenv\n",
 18 |     "\n",
 19 |     "load_dotenv()\n",
 20 |     "\n",
 21 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 22 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 23 |     "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n",
 24 |     "\n",
 25 |     "print(WEAVIATE_URL[:10])\n",
 26 |     "print(WEAVIATE_KEY[:10])\n",
 27 |     "print(OPENAI_API_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 46 |     "\n",
 47 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 48 |     "    cluster_url=WEAVIATE_URL,\n",
 49 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 50 |     "\n",
 51 |     "    headers = {\n",
 52 |     "        \"X-OpenAI-Api-Key\": OPENAI_API_KEY\n",
 53 |     "    },\n",
 54 |     "\n",
 55 |     "    # additional_config=AdditionalConfig(\n",
 56 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 57 |     "    # )\n",
 58 |     ")\n",
 59 |     "\n",
 60 |     "client.is_ready()"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "### Start with (R) - Retrieval"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "wiki = client.collections.use(\"Wiki\")\n",
 77 |     "\n",
 78 |     "response = wiki.query.near_text(\n",
 79 |     "    query=\"How do planes fly\",\n",
 80 |     "    limit=5,\n",
 81 |     "    return_properties=[\"text\", \"title\"]\n",
 82 |     ")\n",
 83 |     "\n",
 84 |     "for item in response.objects:\n",
 85 |     "    print(item.properties)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "### Add (AG) - augmented generation - to make full RAG"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "#### Single Prompt\n",
100 |     "\n",
101 |     "> Generate a response per **retrieved** object."
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "# Let's add some colour to our lives :)\n",
111 |     "BLUE   = \"\\033[94m\"\n",
112 |     "PURPLE = \"\\033[95m\"\n",
113 |     "RESET  = \"\\033[0m\""
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "from weaviate.classes.generate import GenerativeConfig\n",
123 |     "\n",
124 |     "wiki = client.collections.use(\"Wiki\")\n",
125 |     "\n",
126 |     "response = wiki.generate.near_text(\n",
127 |     "    query=\"How do planes fly\",\n",
128 |     "    # auto_limit=1,\n",
129 |     "    limit=5,\n",
130 |     "\n",
131 |     "    # TODO: add GenerativeConfig with OpenAI and \"gpt-4o-mini\"\n",
132 |     "    # generative_provider=GenerativeConfig.\n",
133 |     "\n",
134 |     "    # TODO: add a single prompt \"Explain what this is about? {text}\"\n",
135 |     "    # single_prompt=\n",
136 |     ")\n",
137 |     "\n",
138 |     "# NOTE: the generated responses are included with each object\n",
139 |     "\n",
140 |     "for item in response.objects:\n",
141 |     "    print(f\"{BLUE}=== Source ===\")\n",
142 |     "    print(item.properties)\n",
143 |     "\n",
144 |     "    print(f\"{PURPLE}=== Generated Response ===\")\n",
145 |     "    # TODO: print the generative.text object\n",
146 |     "    # print(item.)\n",
147 |     "\n",
148 |     "    print(\"\\n\")"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "#### Grouped Task\n",
156 |     "\n",
157 |     "> Generate one response based on all **retrieved** objects."
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "wiki = client.collections.use(\"Wiki\")\n",
167 |     "\n",
168 |     "response = wiki.generate.near_text(\n",
169 |     "    query=\"How do planes fly\",\n",
170 |     "    # auto_limit=1,\n",
171 |     "    limit=5,\n",
172 |     "    \n",
173 |     "    generative_provider=GenerativeConfig.openai(\n",
174 |     "        model=\"gpt-4o-mini\",\n",
175 |     "    ),\n",
176 |     "\n",
177 |     "    # TODO: add a grouped task \"Explain, how do planes fly? Please only use the provided content.\"\n",
178 |     "    # grouped_task=\n",
179 |     ")\n",
180 |     "\n",
181 |     "print(f\"{PURPLE}=== Generated Response ===\")\n",
182 |     "# NOTE: group task response is at response.generative.text\n",
183 |     "# TODO: print the generated text\n",
184 |     "# print(response.)\n",
185 |     "\n",
186 |     "print(f\"{BLUE}=== Source ===\")\n",
187 |     "for item in response.objects:\n",
188 |     "    print(item.properties)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "#### Specify which properties to use for grouped task"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "response = wiki.generate.near_text(\n",
205 |     "    query=\"How do planes fly\",\n",
206 |     "    auto_limit=1,\n",
207 |     "\n",
208 |     "    generative_provider=GenerativeConfig.openai(\n",
209 |     "        model=\"gpt-4o-mini\",\n",
210 |     "    ),\n",
211 |     "\n",
212 |     "    grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
213 |     "    # TODO: add grouped properties to only use \"text\" and \"title\" \n",
214 |     "    # grouped_properties=[]\n",
215 |     ")\n",
216 |     "\n",
217 |     "print(\"=== Generated Response ===\")\n",
218 |     "print(response.generative.text)\n",
219 |     "\n",
220 |     "print(\"=== Source ===\")\n",
221 |     "for item in response.objects:\n",
222 |     "    print(item.properties)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "## Set default Generative model"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": null,
235 |    "metadata": {},
236 |    "outputs": [],
237 |    "source": [
238 |     "from weaviate.classes.config import Reconfigure\n",
239 |     "\n",
240 |     "wiki = client.collections.use(\"Wiki\")\n",
241 |     "\n",
242 |     "wiki.config.update(\n",
243 |     "    #TODO: set generative model\n",
244 |     "    # generative_config=Reconfigure.Generative.\n",
245 |     ")"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "> Try generative query without providing the model"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "response = wiki.generate.near_text(\n",
262 |     "    query=\"How do planes fly\",\n",
263 |     "    auto_limit=1,\n",
264 |     "    \n",
265 |     "    grouped_task=\"Explain, how do planes fly? Please only use the provided content.\",\n",
266 |     ")\n",
267 |     "\n",
268 |     "print(f\"{PURPLE}=== Generated Response ===\")\n",
269 |     "print(response.generative.text)\n",
270 |     "\n",
271 |     "print(f\"{BLUE}=== Source ===\")\n",
272 |     "for item in response.objects:\n",
273 |     "    print(item.properties)"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "markdown",
278 |    "metadata": {},
279 |    "source": [
280 |     "## Close the client"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": null,
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": [
289 |     "client.close()"
290 |    ]
291 |   }
292 |  ],
293 |  "metadata": {
294 |   "kernelspec": {
295 |    "display_name": ".venv",
296 |    "language": "python",
297 |    "name": "python3"
298 |   },
299 |   "language_info": {
300 |    "codemirror_mode": {
301 |     "name": "ipython",
302 |     "version": 3
303 |    },
304 |    "file_extension": ".py",
305 |    "mimetype": "text/x-python",
306 |    "name": "python",
307 |    "nbconvert_exporter": "python",
308 |    "pygments_lexer": "ipython3",
309 |    "version": "3.11.8"
310 |   }
311 |  },
312 |  "nbformat": 4,
313 |  "nbformat_minor": 2
314 | }
315 | 


--------------------------------------------------------------------------------
/4-multi-tenancy/1-playground-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import os\n",
 10 |     "from dotenv import load_dotenv\n",
 11 |     "\n",
 12 |     "load_dotenv()\n",
 13 |     "\n",
 14 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 15 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 16 |     "\n",
 17 |     "print(WEAVIATE_URL[:10])\n",
 18 |     "print(WEAVIATE_KEY[:10])"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "# Setup\n"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "import weaviate\n",
 35 |     "from weaviate.classes.init import Auth\n",
 36 |     "\n",
 37 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 38 |     "    cluster_url=WEAVIATE_URL,\n",
 39 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 40 |     ")\n",
 41 |     "\n",
 42 |     "client.is_ready()"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Create Tenant-ready collection"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "from weaviate.classes.config import Configure\n",
 59 |     "\n",
 60 |     "if (client.collections.exists(\"Play\")):\n",
 61 |     "    client.collections.delete(\"Play\")\n",
 62 |     "\n",
 63 |     "client.collections.create(\n",
 64 |     "    \"Play\",\n",
 65 |     "    vector_config=Configure.Vectors.self_provided(),\n",
 66 |     "\n",
 67 |     "    multi_tenancy_config=Configure.multi_tenancy(True)\n",
 68 |     "\n",
 69 |     "    # multi_tenancy_config=Configure.multi_tenancy(\n",
 70 |     "    #     enabled=True,\n",
 71 |     "    #     auto_tenant_creation=True,  # assigning to a non-existent tenant creates it\n",
 72 |     "    #     auto_tenant_activation=True\n",
 73 |     "    # )\n",
 74 |     ")"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "## Create tenants\n",
 82 |     "> A tenant name must consist of alphanumeric characters (a-z, A-Z, 0-9), underscores (_), and hyphens (-), with a length between 1 and 64 characters.\n"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from weaviate.classes.tenants import Tenant\n",
 92 |     "\n",
 93 |     "play = client.collections.use(\"Play\")\n",
 94 |     "\n",
 95 |     "play.tenants.create([\n",
 96 |     "    Tenant(name=\"ten_A\"),\n",
 97 |     "    Tenant(name=\"ten_B\"),\n",
 98 |     "    Tenant(name=\"ten_C\"),\n",
 99 |     "    Tenant(name=\"ten_D\"),\n",
100 |     "])"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "## List Tenants"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "play.tenants.get()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "play.tenants.exists(\"ten_E\")"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "## Access Tenants"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# this will fail – multi-tenant collections require us to use tenants\n",
142 |     "play.aggregate.over_all()"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "tenA = play.with_tenant(\"ten_A\")\n",
152 |     "\n",
153 |     "tenA.aggregate.over_all()"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "### Insert data"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
170 |     "play = client.collections.use(\"Play\")\n",
171 |     "tenA = play.with_tenant(\"ten_A\")\n",
172 |     "\n",
173 |     "tenA.data.insert_many([\n",
174 |     "    {\n",
175 |     "       \"title\": \"A book about vector databases\"\n",
176 |     "    },\n",
177 |     "    {\n",
178 |     "       \"title\": \"Tutorial for multimodal collections\"\n",
179 |     "    },\n",
180 |     "])\n",
181 |     "\n",
182 |     "tenA.aggregate.over_all()"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "### Query Example"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "from weaviate.classes.query import Filter\n",
199 |     "\n",
200 |     "tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
201 |     "\n",
202 |     "response = tenA.query.fetch_objects(\n",
203 |     "    filters=Filter.by_property(\"title\").like(\"about\")\n",
204 |     ")\n",
205 |     "\n",
206 |     "for item in response.objects:\n",
207 |     "    print(item.properties)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "### Delete Tenants"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "play.tenants.remove([\"ten_D\"])\n",
224 |     "play.tenants.get()"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "markdown",
229 |    "metadata": {},
230 |    "source": [
231 |     "### Update Tenants – Active & Inactive & Offloaded\n",
232 |     "Tenants can be:\n",
233 |     "* `Active` (default) - active tenants use  `HOT` resources (RAM)\n",
234 |     "* `Inactive` - inactive tenants cannot be searched; their index is not loaded into memory, so they don't use RAM\n",
235 |     "* `Offloaded` - offloaded tenants are moved to a cloud storage\n",
236 |     "\n",
237 |     "> Tenant offloading requires extra configuration, which is out of scope for this workshop.<br/>\n",
238 |     "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "### Deactivate – make tenant `Inactive`"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "from weaviate.classes.tenants import Tenant, TenantActivityStatus\n",
255 |     "\n",
256 |     "play.tenants.update([\n",
257 |     "    Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.INACTIVE),\n",
258 |     "])\n",
259 |     "\n",
260 |     "play.tenants.get()"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "markdown",
265 |    "metadata": {},
266 |    "source": [
267 |     "**Cannot search `Inactive` tenants**"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": null,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "# tenA = client.collections.use(\"Play\").with_tenant(\"ten_A\")\n",
277 |     "\n",
278 |     "response = tenA.query.fetch_objects(\n",
279 |     "    filters=Filter.by_property(\"title\").like(\"about\")\n",
280 |     ")\n",
281 |     "\n",
282 |     "for item in response.objects:\n",
283 |     "    print(item.properties)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "markdown",
288 |    "metadata": {},
289 |    "source": [
290 |     "### Activate - make tenant `Active`\n",
291 |     "\n",
292 |     "> You can't query an inactive tenant, but you can activate it."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": null,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": [
301 |     "play.tenants.update([\n",
302 |     "    Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.ACTIVE),\n",
303 |     "])\n",
304 |     "\n",
305 |     "play.tenants.get()"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "response = tenA.query.fetch_objects(\n",
315 |     "    filters=Filter.by_property(\"title\").like(\"about\")\n",
316 |     ")\n",
317 |     "\n",
318 |     "for item in response.objects:\n",
319 |     "    print(item.properties)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "### Offload - make tenant `Offloaded`\n",
327 |     "\n",
328 |     "> Tenant offloading requires extra configuration, which is out of scope for this workshop.<br/>\n",
329 |     "> You can learn more from [How-to: Configure - Tenant Offloading](https://weaviate.io/developers/weaviate/configuration/tenant-offloading)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": null,
335 |    "metadata": {},
336 |    "outputs": [],
337 |    "source": [
338 |     "# play.tenants.update([\n",
339 |     "#     Tenant(name=\"ten_A\", activity_status=TenantActivityStatus.OFFLOADED),\n",
340 |     "# ])\n",
341 |     "\n",
342 |     "# play.tenants.get()"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "## Clean up"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": null,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "client.collections.delete(\"Play\")"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "## Don't close yet...\n",
366 |     "\n",
367 |     "> You can try again with `auto_tenant_creation=True` and `auto_tenant_activation=True`"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "markdown",
372 |    "metadata": {},
373 |    "source": [
374 |     "## Close the client"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": null,
380 |    "metadata": {},
381 |    "outputs": [],
382 |    "source": [
383 |     "client.close()"
384 |    ]
385 |   }
386 |  ],
387 |  "metadata": {
388 |   "kernelspec": {
389 |    "display_name": ".venv (3.11.9)",
390 |    "language": "python",
391 |    "name": "python3"
392 |   },
393 |   "language_info": {
394 |    "codemirror_mode": {
395 |     "name": "ipython",
396 |     "version": 3
397 |    },
398 |    "file_extension": ".py",
399 |    "mimetype": "text/x-python",
400 |    "name": "python",
401 |    "nbconvert_exporter": "python",
402 |    "pygments_lexer": "ipython3",
403 |    "version": "3.11.9"
404 |   }
405 |  },
406 |  "nbformat": 4,
407 |  "nbformat_minor": 2
408 | }
409 | 


--------------------------------------------------------------------------------
/1-intro/complete/2-query-complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Query the data\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Connect to Weaviate"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import weaviate\n",
 44 |     "from weaviate.classes.init import Auth\n",
 45 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 46 |     "\n",
 47 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 48 |     "    cluster_url=WEAVIATE_URL,\n",
 49 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 50 |     "\n",
 51 |     "    # additional_config=AdditionalConfig(\n",
 52 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 53 |     "    # )\n",
 54 |     ")\n",
 55 |     "\n",
 56 |     "client.is_ready()"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "### Helper function"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "import json\n",
 73 |     "def print_properties(item):\n",
 74 |     "    print(\n",
 75 |     "        json.dumps(\n",
 76 |     "            item.properties,\n",
 77 |     "            indent=2, sort_keys=True, default=str\n",
 78 |     "        )\n",
 79 |     "    )"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "## Vector search\n",
 87 |     "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "wiki = client.collections.use(\"Wiki\")\n",
 97 |     "\n",
 98 |     "response = wiki.query.near_text(\n",
 99 |     "    query=\"musical instruments\",\n",
100 |     "    limit=5\n",
101 |     ")\n",
102 |     "\n",
103 |     "for item in response.objects:\n",
104 |     "    print_properties(item)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "from weaviate.classes.query import MetadataQuery\n",
114 |     "\n",
115 |     "wiki = client.collections.use(\"Wiki\")\n",
116 |     "\n",
117 |     "response = wiki.query.near_text(\n",
118 |     "    query=\"musical instruments\",\n",
119 |     "    limit=5,\n",
120 |     "    return_metadata=MetadataQuery(distance=True)\n",
121 |     ")\n",
122 |     "\n",
123 |     "for item in response.objects:\n",
124 |     "    print_properties(item)\n",
125 |     "    print(item.metadata.distance)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "### Autocut\n",
133 |     "\n",
134 |     "Return groups of results based on the quality/distance jumps"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "from weaviate.classes.query import MetadataQuery\n",
144 |     "\n",
145 |     "wiki = client.collections.use(\"Wiki\")\n",
146 |     "\n",
147 |     "response = wiki.query.near_text(\n",
148 |     "    query=\"musical instruments\",\n",
149 |     "    auto_limit=1,\n",
150 |     "    return_metadata=MetadataQuery(distance=True)\n",
151 |     ")\n",
152 |     "\n",
153 |     "print(f\"Returned object count: {len(response.objects)}\")\n",
154 |     "\n",
155 |     "for item in response.objects:\n",
156 |     "    print_properties(item)\n",
157 |     "    print(item.metadata.distance)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "## Filters"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "### Fetch with filters"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "from weaviate.classes.query import Filter\n",
181 |     "\n",
182 |     "wiki = client.collections.use(\"Wiki\")\n",
183 |     "\n",
184 |     "response = wiki.query.fetch_objects(\n",
185 |     "    limit=5,\n",
186 |     "    filters=Filter.by_property(\"title\").like(\"music\")\n",
187 |     ")\n",
188 |     "\n",
189 |     "for item in response.objects:\n",
190 |     "    print_properties(item)"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "from weaviate.classes.query import Filter\n",
200 |     "\n",
201 |     "response = wiki.query.fetch_objects(\n",
202 |     "    limit=5,\n",
203 |     "    filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n",
204 |     ")\n",
205 |     "\n",
206 |     "for item in response.objects:\n",
207 |     "    print_properties(item)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "## Search with filters\n",
215 |     "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "from weaviate.classes.query import Filter\n",
225 |     "\n",
226 |     "response = wiki.query.near_text(\n",
227 |     "    query=\"musical instruments\",\n",
228 |     "    limit=5,\n",
229 |     "    filters=Filter.by_property(\"title\").not_equal(\"music\")\n",
230 |     ")\n",
231 |     "\n",
232 |     "for item in response.objects:\n",
233 |     "    print_properties(item)"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "## Keyword Search\n",
241 |     "\n",
242 |     "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {},
249 |    "outputs": [],
250 |    "source": [
251 |     "response = wiki.query.bm25(\n",
252 |     "    query=\"musical instruments\",\n",
253 |     "    limit=5,\n",
254 |     ")\n",
255 |     "\n",
256 |     "for item in response.objects:\n",
257 |     "    print_properties(item)"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {},
264 |    "outputs": [],
265 |    "source": [
266 |     "from weaviate.classes.query import MetadataQuery\n",
267 |     "\n",
268 |     "response = wiki.query.bm25(\n",
269 |     "    query=\"musical instruments\",\n",
270 |     "    query_properties=[\"text\", \"title\"],\n",
271 |     "    limit=5,\n",
272 |     "    return_metadata=MetadataQuery(score=True)\n",
273 |     ")\n",
274 |     "\n",
275 |     "for item in response.objects:\n",
276 |     "    print_properties(item)\n",
277 |     "    print(item.metadata.score)"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {},
284 |    "outputs": [],
285 |    "source": [
286 |     "from weaviate.classes.query import MetadataQuery\n",
287 |     "\n",
288 |     "response = wiki.query.bm25(\n",
289 |     "    query=\"musical instruments\",\n",
290 |     "    query_properties=[\"text\", \"title^3\"],\n",
291 |     "    limit=5,\n",
292 |     "    return_metadata=MetadataQuery(score=True)\n",
293 |     ")\n",
294 |     "\n",
295 |     "for item in response.objects:\n",
296 |     "    print_properties(item)\n",
297 |     "    print(item.metadata.score)"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "## Hybrid search\n",
305 |     "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "response = wiki.query.hybrid(\n",
315 |     "    query=\"musical instruments\",\n",
316 |     "    alpha=0.7,\n",
317 |     "    limit=5,\n",
318 |     ")\n",
319 |     "\n",
320 |     "for item in response.objects:\n",
321 |     "    print_properties(item)"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {},
327 |    "source": [
328 |     "### Hybrid - select properties"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": null,
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "response = wiki.query.hybrid(\n",
338 |     "    query=\"musical instruments\",\n",
339 |     "    alpha=0.7,\n",
340 |     "    limit=5,\n",
341 |     "    query_properties=[\"title\"]\n",
342 |     ")\n",
343 |     "\n",
344 |     "for item in response.objects:\n",
345 |     "    print_properties(item)"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "markdown",
350 |    "metadata": {},
351 |    "source": [
352 |     "### Hybrid - Explain score"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "metadata": {},
359 |    "outputs": [],
360 |    "source": [
361 |     "from weaviate.classes.query import MetadataQuery\n",
362 |     "\n",
363 |     "response = wiki.query.hybrid(\n",
364 |     "    query=\"musical instruments\",\n",
365 |     "    alpha=0.7,\n",
366 |     "    limit=5,\n",
367 |     "    query_properties=[\"title\"],\n",
368 |     "    return_metadata=MetadataQuery(score=True, explain_score=True)\n",
369 |     ")\n",
370 |     "\n",
371 |     "for item in response.objects:\n",
372 |     "    print_properties(item)\n",
373 |     "    print(item.metadata.score)\n",
374 |     "    print(item.metadata.explain_score)"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "markdown",
379 |    "metadata": {},
380 |    "source": [
381 |     "## Close the client"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": null,
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": [
390 |     "client.close()"
391 |    ]
392 |   }
393 |  ],
394 |  "metadata": {
395 |   "kernelspec": {
396 |    "display_name": ".venv (3.11.9)",
397 |    "language": "python",
398 |    "name": "python3"
399 |   },
400 |   "language_info": {
401 |    "codemirror_mode": {
402 |     "name": "ipython",
403 |     "version": 3
404 |    },
405 |    "file_extension": ".py",
406 |    "mimetype": "text/x-python",
407 |    "name": "python",
408 |    "nbconvert_exporter": "python",
409 |    "pygments_lexer": "ipython3",
410 |    "version": "3.11.9"
411 |   }
412 |  },
413 |  "nbformat": 4,
414 |  "nbformat_minor": 2
415 | }
416 | 


--------------------------------------------------------------------------------
/1-intro/2-query.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Query the data\n",
  8 |     "\n",
  9 |     "## Get keys and urls"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import os\n",
 19 |     "from dotenv import load_dotenv\n",
 20 |     "\n",
 21 |     "load_dotenv()\n",
 22 |     "\n",
 23 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 24 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 25 |     "\n",
 26 |     "print(WEAVIATE_URL[:10])\n",
 27 |     "print(WEAVIATE_KEY[:10])\n",
 28 |     "\n",
 29 |     "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n",
 30 |     "    raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "## Connect to Weaviate"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "import weaviate\n",
 47 |     "from weaviate.classes.init import Auth\n",
 48 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 49 |     "\n",
 50 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 51 |     "    cluster_url=WEAVIATE_URL,\n",
 52 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 53 |     "\n",
 54 |     "    # additional_config=AdditionalConfig(\n",
 55 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 56 |     "    # )\n",
 57 |     ")\n",
 58 |     "\n",
 59 |     "client.is_ready()"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "### Helper function"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "import json\n",
 76 |     "def print_properties(item):\n",
 77 |     "    print(\n",
 78 |     "        json.dumps(\n",
 79 |     "            item.properties,\n",
 80 |     "            indent=2, sort_keys=True, default=str\n",
 81 |     "        )\n",
 82 |     "    )"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "## Vector search\n",
 90 |     "[Docs - near_text](https://weaviate.io/developers/weaviate/search/similarity#an-input-medium)"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "# TODO: get the Wiki collection\n",
100 |     "# wiki = \n",
101 |     "\n",
102 |     "# TODO: run a near text query, search for musical instruments, with limit 5\n",
103 |     "# response = wiki.query.\n",
104 |     "\n",
105 |     "for item in response.objects:\n",
106 |     "    print_properties(item)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "from weaviate.classes.query import MetadataQuery\n",
116 |     "\n",
117 |     "wiki = client.collections.use(\"Wiki\")\n",
118 |     "\n",
119 |     "response = wiki.query.near_text(\n",
120 |     "    query=\"musical instruments\",\n",
121 |     "    limit=5,\n",
122 |     "    # TODO: add MetadataQuery - request distance\n",
123 |     "    # return_metadata=\n",
124 |     ")\n",
125 |     "\n",
126 |     "for item in response.objects:\n",
127 |     "    print_properties(item)\n",
128 |     "    print(item.metadata.distance)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "markdown",
133 |    "metadata": {},
134 |    "source": [
135 |     "### Autocut\n",
136 |     "\n",
137 |     "Return groups of results based on the quality/distance jumps"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "from weaviate.classes.query import MetadataQuery\n",
147 |     "\n",
148 |     "wiki = client.collections.use(\"Wiki\")\n",
149 |     "\n",
150 |     "response = wiki.query.near_text(\n",
151 |     "    query=\"musical instruments\",\n",
152 |     "    # TODO: use auto_limit instead of limit, set it to 1\n",
153 |     "\n",
154 |     "    return_metadata=MetadataQuery(distance=True)\n",
155 |     ")\n",
156 |     "\n",
157 |     "print(f\"Returned object count: {len(response.objects)}\")\n",
158 |     "\n",
159 |     "for item in response.objects:\n",
160 |     "    print_properties(item)\n",
161 |     "    print(item.metadata.distance)"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "## Filters"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "### Fetch with filters"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "from weaviate.classes.query import Filter\n",
185 |     "\n",
186 |     "wiki = client.collections.use(\"Wiki\")\n",
187 |     "\n",
188 |     "response = wiki.query.fetch_objects(\n",
189 |     "    limit=5,\n",
190 |     "    # TODO: filter by property title, search for something like music\n",
191 |     "    # filters=\n",
192 |     ")\n",
193 |     "\n",
194 |     "for item in response.objects:\n",
195 |     "    print_properties(item)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "from weaviate.classes.query import Filter\n",
205 |     "\n",
206 |     "response = wiki.query.fetch_objects(\n",
207 |     "    limit=5,\n",
208 |     "    # NOTE: you can use & as AND operator and | as OR operator\n",
209 |     "    filters=Filter.by_property(\"title\").like(\"m*ic\") & Filter.by_property(\"title\").not_equal(\"music\")\n",
210 |     ")\n",
211 |     "\n",
212 |     "for item in response.objects:\n",
213 |     "    print_properties(item)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "## Search with filters\n",
221 |     "[Docs - Filters](https://weaviate.io/developers/weaviate/search/filters)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "from weaviate.classes.query import Filter\n",
231 |     "\n",
232 |     "response = wiki.query.near_text(\n",
233 |     "    query=\"musical instruments\",\n",
234 |     "    limit=5,\n",
235 |     "    filters=Filter.by_property(\"title\").not_equal(\"music\")\n",
236 |     ")\n",
237 |     "\n",
238 |     "for item in response.objects:\n",
239 |     "    print_properties(item)"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "markdown",
244 |    "metadata": {},
245 |    "source": [
246 |     "## Keyword Search\n",
247 |     "\n",
248 |     "[Docs - keyword/bm25](https://weaviate.io/developers/weaviate/search/bm25)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "# TODO: use bm25 query, search for musical instruments, set limit to 5\n",
258 |     "\n",
259 |     "# response = wiki.\n",
260 |     "\n",
261 |     "for item in response.objects:\n",
262 |     "    print_properties(item)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": null,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "from weaviate.classes.query import MetadataQuery\n",
272 |     "\n",
273 |     "response = wiki.query.bm25(\n",
274 |     "    query=\"musical instruments\",\n",
275 |     "    # TODO: add query properties for \"text\" and \"title\"\n",
276 |     "    # query_properties=[],\n",
277 |     "    limit=5,\n",
278 |     "    return_metadata=MetadataQuery(score=True)\n",
279 |     ")\n",
280 |     "\n",
281 |     "for item in response.objects:\n",
282 |     "    print_properties(item)\n",
283 |     "    print(item.metadata.score)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "from weaviate.classes.query import MetadataQuery\n",
293 |     "\n",
294 |     "response = wiki.query.bm25(\n",
295 |     "    query=\"musical instruments\",\n",
296 |     "    query_properties=[\"text\", \"title^3\"],\n",
297 |     "    limit=5,\n",
298 |     "    return_metadata=MetadataQuery(score=True)\n",
299 |     ")\n",
300 |     "\n",
301 |     "for item in response.objects:\n",
302 |     "    print_properties(item)\n",
303 |     "    print(item.metadata.score)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "markdown",
308 |    "metadata": {},
309 |    "source": [
310 |     "## Hybrid search\n",
311 |     "[Docs - hybrid](https://weaviate.io/developers/weaviate/search/hybrid)"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "# TODO: use hybrid query, search for musical instruments, set alpha to 0.7, and limit to 5\n",
321 |     "# response = wiki.\n",
322 |     "\n",
323 |     "for item in response.objects:\n",
324 |     "    print_properties(item)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "### Hybrid - select properties"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "response = wiki.query.hybrid(\n",
341 |     "    query=\"musical instruments\",\n",
342 |     "    alpha=0.7,\n",
343 |     "    limit=5,\n",
344 |     "    # TODO: add query properties for \"title\"\n",
345 |     ")\n",
346 |     "\n",
347 |     "for item in response.objects:\n",
348 |     "    print_properties(item)"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "### Hybrid - Explain score"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "from weaviate.classes.query import MetadataQuery\n",
365 |     "\n",
366 |     "response = wiki.query.hybrid(\n",
367 |     "    query=\"musical instruments\",\n",
368 |     "    alpha=0.7,\n",
369 |     "    limit=5,\n",
370 |     "    query_properties=[\"title\"],\n",
371 |     "    return_metadata=MetadataQuery(score=True, explain_score=True)\n",
372 |     ")\n",
373 |     "\n",
374 |     "for item in response.objects:\n",
375 |     "    print_properties(item)\n",
376 |     "    print(item.metadata.score)\n",
377 |     "    print(item.metadata.explain_score)"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "markdown",
382 |    "metadata": {},
383 |    "source": [
384 |     "## Close the client"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 |    "source": [
393 |     "client.close()"
394 |    ]
395 |   }
396 |  ],
397 |  "metadata": {
398 |   "kernelspec": {
399 |    "display_name": ".venv (3.11.9)",
400 |    "language": "python",
401 |    "name": "python3"
402 |   },
403 |   "language_info": {
404 |    "codemirror_mode": {
405 |     "name": "ipython",
406 |     "version": 3
407 |    },
408 |    "file_extension": ".py",
409 |    "mimetype": "text/x-python",
410 |    "name": "python",
411 |    "nbconvert_exporter": "python",
412 |    "pygments_lexer": "ipython3",
413 |    "version": "3.11.9"
414 |   }
415 |  },
416 |  "nbformat": 4,
417 |  "nbformat_minor": 2
418 | }
419 | 


--------------------------------------------------------------------------------
/2-pre-vectorised-data/1-playground-run.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import os\n",
 10 |     "from dotenv import load_dotenv\n",
 11 |     "\n",
 12 |     "load_dotenv()\n",
 13 |     "\n",
 14 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 15 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 16 |     "\n",
 17 |     "print(WEAVIATE_URL[:10])\n",
 18 |     "print(WEAVIATE_KEY[:10])"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "## Connect to Weaviate"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "import weaviate\n",
 35 |     "from weaviate.classes.init import Auth\n",
 36 |     "\n",
 37 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 38 |     "    cluster_url=WEAVIATE_URL,\n",
 39 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 40 |     ")\n",
 41 |     "\n",
 42 |     "client.is_ready()"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Create a collection with no vectorizer"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "# Note: in practice, you shouldn't rerun this cell, as it deletes your data\n",
 59 |     "# in \"MyCollection\", and you would then need to re-import it.\n",
 60 |     "from  weaviate.classes.config import Configure, VectorDistances\n",
 61 |     "\n",
 62 |     "# Delete the collection if it already exists\n",
 63 |     "if (client.collections.exists(\"MyCollection\")):\n",
 64 |     "    client.collections.delete(\"MyCollection\")\n",
 65 |     "\n",
 66 |     "client.collections.create(\n",
 67 |     "    name=\"MyCollection\",\n",
 68 |     "    vector_config=Configure.Vectors.self_provided( # No vectorizer needed\n",
 69 |     "        vector_index_config=Configure.VectorIndex.hnsw( # Optional\n",
 70 |     "        distance_metric=VectorDistances.COSINE # select preferred distance metric\n",
 71 |     "        )\n",
 72 |     "    ),\n",
 73 |     ")\n",
 74 |     "\n",
 75 |     "print(f\"Successfully created collection: {'MyCollection'}.\")"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "## Insert an object with a vector"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "my_collection = client.collections.use(\"MyCollection\")\n",
 92 |     "my_collection.data.insert(\n",
 93 |     "    properties={\n",
 94 |     "        \"title\": \"First Object\",\n",
 95 |     "        \"foo\": 11, \n",
 96 |     "    },\n",
 97 |     "    vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n",
 98 |     ")"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "res = my_collection.query.fetch_objects(include_vector=True)\n",
108 |     "\n",
109 |     "print(res.objects[0].properties)\n",
110 |     "print(res.objects[0].vector)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "## Insert many objects with their vectors using batch"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": [
126 |     "source = [\n",
127 |     "    {\n",
128 |     "        \"title\": \"Second Object\",\n",
129 |     "        \"foo\": 22,\n",
130 |     "        \"vector\": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n",
131 |     "    },\n",
132 |     "    {\n",
133 |     "        \"title\": \"Third Object\",\n",
134 |     "        \"foo\": 33,\n",
135 |     "        \"vector\": [0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n",
136 |     "    },\n",
137 |     "    {\n",
138 |     "        \"title\": \"Fourth Object\",\n",
139 |     "        \"foo\": 44,\n",
140 |     "        \"vector\": [0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n",
141 |     "    },\n",
142 |     "    {\n",
143 |     "        \"title\": \"Fifth Object\",\n",
144 |     "        \"foo\": 55,\n",
145 |     "        \"vector\": [0.5, 0.5, 0, 0, 0, 0]\n",
146 |     "    },\n",
147 |     "]"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "\n",
157 |     "with my_collection.batch.dynamic() as batch:\n",
158 |     "    for item in source:\n",
159 |     "        batch.add_object(\n",
160 |     "            properties={\n",
161 |     "                \"title\": item[\"title\"],\n",
162 |     "                \"foo\": item[\"foo\"],\n",
163 |     "            },\n",
164 |     "            vector=item[\"vector\"]\n",
165 |     "        )"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "metadata": {},
171 |    "source": [
172 |     "## Example with insert_many"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "# sample_data = [\n",
182 |     "#    wc.DataObject(\n",
183 |     "#       properties={\n",
184 |     "#          \"title\": \"First Object\",\n",
185 |     "#          \"foo\": 11, \n",
186 |     "#       },\n",
187 |     "#       vector=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]\n",
188 |     "#    ),\n",
189 |     "#    wc.DataObject(\n",
190 |     "#       properties={\n",
191 |     "#          \"title\": \"Second Object\",\n",
192 |     "#          \"foo\": 22,\n",
193 |     "#       },\n",
194 |     "#       vector=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7]\n",
195 |     "#    ),\n",
196 |     "#    wc.DataObject(\n",
197 |     "#       properties={\n",
198 |     "#          \"title\": \"Third Object\",\n",
199 |     "#          \"foo\": 33,\n",
200 |     "#       },\n",
201 |     "#       vector=[0.3, 0.1, -0.1, -0.3, -0.5, -0.7]\n",
202 |     "#    ),\n",
203 |     "#    wc.DataObject(\n",
204 |     "#       properties={\n",
205 |     "#          \"title\": \"Fourth Object\",\n",
206 |     "#         \"foo\": 44,\n",
207 |     "#       },\n",
208 |     "#       vector=[0.4, 0.41, 0.42, 0.43, 0.44, 0.45]\n",
209 |     "#    ),\n",
210 |     "#    wc.DataObject(\n",
211 |     "#       properties={\n",
212 |     "#          \"title\": \"Fifth Object\",\n",
213 |     "#          \"foo\": 55,\n",
214 |     "#       },\n",
215 |     "#       vector=[0.5, 0.5, 0, 0, 0, 0]\n",
216 |     "#    ),\n",
217 |     "# ]\n",
218 |     "\n",
219 |     "# my_collection.data.insert_many(sample_data)"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "## Query\n",
227 |     "Available types of queries you can run when working with vector embeddings (without modules) in **Weaviate**:\n",
228 |     "\n",
229 |     "1. [near_vector](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector)\n",
230 |     "\n",
231 |     "2. [near_object](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "### nearVector Example\n",
239 |     "**First example** - Search Weaviate with a vector embedding, and return title property.\n",
240 |     "\n",
241 |     "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-a-vector) for more."
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "response = my_collection.query.near_vector(\n",
251 |     "    near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
252 |     "    limit=2,\n",
253 |     ")\n",
254 |     "\n",
255 |     "for item in response.objects:\n",
256 |     "    print(item.uuid)\n",
257 |     "    print(item.properties, \"\\n\")"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "**Second example** - The same search query, but this time also return `distance`, and `vector`."
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "from weaviate.classes.query import MetadataQuery\n",
274 |     "\n",
275 |     "response = my_collection.query.near_vector(\n",
276 |     "    near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
277 |     "    include_vector=True,\n",
278 |     "    return_metadata=MetadataQuery(distance=True),\n",
279 |     "    limit=2,\n",
280 |     ")\n",
281 |     "\n",
282 |     "for item in response.objects:\n",
283 |     "    print(item.properties)\n",
284 |     "    print(item.metadata.distance)\n",
285 |     "    print(item.vector, \"\\n\")"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "**Third example** – Same vector query, but this time we will filter on \"foo\" (which should be greater than 30). Also, let's return \"title\" and \"foo\".\n",
293 |     "\n",
294 |     "See [the docs](https://weaviate.io/developers/weaviate/search/filters#filter-with-one-condition) for more."
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "from weaviate.classes.query import Filter, MetadataQuery\n",
304 |     "\n",
305 |     "response = my_collection.query.near_vector(\n",
306 |     "    near_vector=[-0.012, 0.021, -0.23, -0.42, 0.5, 0.5],\n",
307 |     "    return_metadata=MetadataQuery(distance=True),\n",
308 |     "    filters=Filter.by_property(\"foo\").greater_than(30),\n",
309 |     "    limit=2,\n",
310 |     ")\n",
311 |     "\n",
312 |     "for item in response.objects:\n",
313 |     "    print(item.properties)\n",
314 |     "    print(item.metadata.distance, \"\\n\")"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "markdown",
319 |    "metadata": {},
320 |    "source": [
321 |     "### nearObject Example\n",
322 |     "\n",
323 |     "Weaviate also allows you to search for similar objects.\n",
324 |     "\n",
325 |     "See [the docs](https://weaviate.io/developers/weaviate/search/similarity#search-with-an-existing-object) for more.\n",
326 |     "\n",
327 |     "**Fourth example** - \n",
328 |     "Search through `MyCollection` for similar objects, by providing an id from the previous query. \n",
329 |     "\n",
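330 |     "> Note #1: The id used below was copied from the output of an earlier query. <br/>\n",
331 |     "> The ids generated for your objects will be different, so replace it with a `uuid` printed by the first nearVector example.\n",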
332 |     "\n",
333 |     "> Note #2: The first object returned is always itself."
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "from weaviate.classes.query import MetadataQuery\n",
343 |     "\n",
344 |     "response = my_collection.query.near_object(\n",
345 |     "    near_object=\"20805faa-f0b6-404a-aa34-8a44e01e0bcd\",\n",
346 |     "    return_metadata=MetadataQuery(distance=True),\n",
347 |     "    limit=3,\n",
348 |     ")\n",
349 |     "\n",
350 |     "for item in response.objects:\n",
351 |     "    print(item.uuid)\n",
352 |     "    print(item.properties)\n",
353 |     "    print(item.metadata.distance, \"\\n\")"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {},
359 |    "source": [
360 |     "## Close the client"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": null,
366 |    "metadata": {},
367 |    "outputs": [],
368 |    "source": [
369 |     "client.close()"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": []
378 |   }
379 |  ],
380 |  "metadata": {
381 |   "kernelspec": {
382 |    "display_name": ".venv (3.11.9)",
383 |    "language": "python",
384 |    "name": "python3"
385 |   },
386 |   "language_info": {
387 |    "codemirror_mode": {
388 |     "name": "ipython",
389 |     "version": 3
390 |    },
391 |    "file_extension": ".py",
392 |    "mimetype": "text/x-python",
393 |    "name": "python",
394 |    "nbconvert_exporter": "python",
395 |    "pygments_lexer": "ipython3",
396 |    "version": "3.11.9"
397 |   }
398 |  },
399 |  "nbformat": 4,
400 |  "nbformat_minor": 2
401 | }
402 | 


--------------------------------------------------------------------------------
/1-intro/complete/1-load-data-complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Collection setup and data load"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## Get keys and urls"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import os\n",
 24 |     "from dotenv import load_dotenv\n",
 25 |     "\n",
 26 |     "load_dotenv()\n",
 27 |     "\n",
 28 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 29 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 30 |     "\n",
 31 |     "print(WEAVIATE_URL[:10])\n",
 32 |     "print(WEAVIATE_KEY[:10])"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "## Connect to Weaviate\n",
 40 |     "\n",
 41 |     "You need to pass in your Weaviate Cloud URL and KEY."
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "import weaviate\n",
 51 |     "from weaviate.classes.init import Auth\n",
 52 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 53 |     "\n",
 54 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 55 |     "    cluster_url=WEAVIATE_URL,\n",
 56 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 57 |     "\n",
 58 |     "    # additional_config=AdditionalConfig(\n",
 59 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 60 |     "    # )\n",
 61 |     ")\n",
 62 |     "\n",
 63 |     "client.is_ready()"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "## Create a collection with a vectorizer\n",
 71 |     "\n",
 72 |     "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n",
 73 |     "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n",
 74 |     "\n",
 75 |     "Examples of other embedding models:\n",
 76 |     "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n",
 77 |     "* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n",
 78 |     "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n",
 79 |     "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "from weaviate.classes.config import Configure\n",
 89 |     "\n",
 90 |     "if client.collections.exists(\"Jeopardy\"):\n",
 91 |     "    client.collections.delete(\"Jeopardy\")\n",
 92 |     "\n",
 93 |     "# Create a collection - with Weaviate vectorizer\n",
 94 |     "client.collections.create(\n",
 95 |     "    name=\"Jeopardy\",\n",
 96 |     "\n",
 97 |     "    # https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings\n",
 98 |     "    vector_config=Configure.Vectors.text2vec_weaviate(\n",
 99 |     "        model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
100 |     "        dimensions=256 # options 1024 (default) and 256\n",
101 |     "    ),\n",
102 |     ")"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Import data\n",
110 |     "### Sample Data"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "import json\n",
120 |     "\n",
121 |     "with open(\"../jeopardy_tiny.json\") as file:\n",
122 |     "    data_10 = json.load(file)\n",
123 |     "\n",
124 |     "print(json.dumps(data_10[0:2], indent=2))"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "### Insert Many\n",
132 |     "\n",
133 |     "> `insert_many` should only be used for small batches of data, because the whole request must complete within the timeout.\n",
134 |     "\n",
135 |     "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# Insert data\n",
145 |     "jeopardy = client.collections.use(\"Jeopardy\")\n",
146 |     "jeopardy.data.insert_many(data_10)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "### Data preview"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# Show data preview\n",
163 |     "jeopardy = client.collections.use(\"Jeopardy\")\n",
164 |     "response = jeopardy.query.fetch_objects(limit=4)\n",
165 |     "\n",
166 |     "for item in response.objects:\n",
167 |     "    print(item.uuid, item.properties)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "# Show data preview - with vectors\n",
177 |     "jeopardy = client.collections.use(\"Jeopardy\")\n",
178 |     "response = jeopardy.query.fetch_objects(\n",
179 |     "    limit=4,\n",
180 |     "    include_vector=True\n",
181 |     ")\n",
182 |     "\n",
183 |     "for item in response.objects:\n",
184 |     "    print(item.properties)\n",
185 |     "    print(item.vector, '\\n')"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "### Super quick query example"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "response = jeopardy.query.near_text(\n",
202 |     "    query=\"African animals\",\n",
203 |     "    # query=\"weather\",\n",
204 |     "    limit=2\n",
205 |     ")\n",
206 |     "\n",
207 |     "for item in response.objects:\n",
208 |     "    print(item.properties)"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "markdown",
213 |    "metadata": {},
214 |    "source": [
215 |     "## A bit bigger example - 2k objects"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "### Load data"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "import json\n",
232 |     "\n",
233 |     "with open(\"../wiki-2k.json\") as file:\n",
234 |     "    data_2k = json.load(file)\n",
235 |     "\n",
236 |     "print(json.dumps(data_2k[0:2], indent=2))"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "### Create a collection with Named Vectors and SourceProperties"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "from weaviate.classes.config import Configure, Property, DataType\n",
253 |     "\n",
254 |     "def create_wiki_collection():\n",
255 |     "    if client.collections.exists(\"Wiki\"):\n",
256 |     "        client.collections.delete(\"Wiki\")\n",
257 |     "\n",
258 |     "    # Create a collection here - with Weaviate vectorizer and define source properties\n",
259 |     "    client.collections.create(\n",
260 |     "        name=\"Wiki\",\n",
261 |     "\n",
262 |     "        vector_config=[\n",
263 |     "            Configure.Vectors.text2vec_weaviate(\n",
264 |     "                name=\"main_vector\",\n",
265 |     "                model=\"Snowflake/snowflake-arctic-embed-l-v2.0\", # default\n",
266 |     "                source_properties=['title', 'text'] # which properties should be used to generate a vector\n",
267 |     "            )\n",
268 |     "        ],\n",
269 |     "\n",
270 |     "        # Example: how to define property schema (Optional)\n",
271 |     "        # properties=[  \n",
272 |     "        #     Property(name=\"title\", data_type=DataType.TEXT),\n",
273 |     "        #     Property(name=\"text\", data_type=DataType.TEXT),\n",
274 |     "        #     Property(name=\"url\", data_type=DataType.TEXT),\n",
275 |     "        #     Property(name=\"wiki_id\", data_type=DataType.TEXT),\n",
276 |     "        # ],\n",
277 |     "    )\n",
278 |     "\n",
279 |     "create_wiki_collection()"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "### Import data - 2k objects with Batch\n",
287 |     "\n",
288 |     "Batching speeds up the import process by sending objects to Weaviate in groups instead of one at a time.\n",
289 |     "\n",
290 |     "Batch creates an internal buffer to collect objects to be added.<br>\n",
291 |     "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n",
292 |     "\n",
293 |     "Types of batch:\n",
294 |     "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n",
295 |     "* `fixed_size` - provide a fixed batch_size\n",
296 |     "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "markdown",
301 |    "metadata": {},
302 |    "source": [
303 |     "### Take 1 – import sample 100"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "from tqdm import tqdm\n",
313 |     "\n",
314 |     "sample_100 = data_2k[0:100]\n",
315 |     "\n",
316 |     "wiki = client.collections.use(\"Wiki\")\n",
317 |     "\n",
318 |     "with wiki.batch.dynamic() as batch:\n",
319 |     "    for item in tqdm(sample_100):\n",
320 |     "        batch.add_object(item)\n",
321 |     "\n",
322 |     "print(f\"Wiki count: {len(wiki)}\")"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "# check for errors\n",
332 |     "if(len(wiki.batch.failed_objects)>0):\n",
333 |     "    print(\"Import complete with errors\")\n",
334 |     "    for err in wiki.batch.failed_objects:\n",
335 |     "        print(err)\n",
336 |     "else:\n",
337 |     "    print(\"Import complete with no errors\")"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "metadata": {},
343 |    "source": [
344 |     "### Take 2 – import sample 100 – with UUID\n",
345 |     "\n",
346 |     "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property."
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": [
355 |     "from weaviate.util import generate_uuid5\n",
356 |     "\n",
357 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
358 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
359 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
360 |     "print(\"====================================\")\n",
361 |     "\n",
362 |     "print(generate_uuid5(\"This UUID is different\"))\n",
363 |     "print(generate_uuid5(\"This UUID is different\"))\n",
364 |     "print(\"====================================\")\n",
365 |     "\n",
366 |     "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n",
367 |     "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n",
368 |     "print(generate_uuid5(obj1))\n",
369 |     "print(generate_uuid5(obj2))\n"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "# recreate the collection to start again\n",
379 |     "create_wiki_collection()"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "markdown",
384 |    "metadata": {},
385 |    "source": [
386 |     "> Rerun the import script multiple times.\n",
387 |     "\n",
388 |     "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase."
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "metadata": {},
395 |    "outputs": [],
396 |    "source": [
397 |     "from tqdm import tqdm\n",
398 |     "from weaviate.util import generate_uuid5\n",
399 |     "\n",
400 |     "sample_100 = data_2k[0:100]\n",
401 |     "\n",
402 |     "wiki = client.collections.use(\"Wiki\")\n",
403 |     "\n",
404 |     "with wiki.batch.fixed_size(batch_size=20, concurrent_requests=2) as batch:\n",
405 |     "    for item in tqdm(sample_100):\n",
406 |     "        id = generate_uuid5(item[\"wiki_id\"])\n",
407 |     "\n",
408 |     "        batch.add_object(\n",
409 |     "            item,\n",
410 |     "            uuid=id\n",
411 |     "        )\n",
412 |     "\n",
413 |     "print(f\"Wiki count: {len(wiki)}\")"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "markdown",
418 |    "metadata": {},
419 |    "source": [
420 |     "### Take 3 – import the rest of the data – but stop if there are too many errors"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": null,
426 |    "metadata": {},
427 |    "outputs": [],
428 |    "source": [
429 |     "from tqdm import tqdm\n",
430 |     "from weaviate.util import generate_uuid5\n",
431 |     "\n",
432 |     "wiki = client.collections.use(\"Wiki\")\n",
433 |     "\n",
434 |     "with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:\n",
435 |     "    for item in tqdm(data_2k):\n",
436 |     "        id = generate_uuid5(item[\"wiki_id\"])\n",
437 |     "        batch.add_object(item, uuid=id)\n",
438 |     "\n",
439 |     "        # Check number of errors while running\n",
440 |     "        if(batch.number_errors > 10):\n",
441 |     "            print(\"Errors during batch import\")\n",
442 |     "            break"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "markdown",
447 |    "metadata": {},
448 |    "source": [
449 |     "### Check for errors"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "code",
454 |    "execution_count": null,
455 |    "metadata": {},
456 |    "outputs": [],
457 |    "source": [
458 |     "if(len(wiki.batch.failed_objects)>0):\n",
459 |     "    print(\"Import complete with errors\")\n",
460 |     "    for err in wiki.batch.failed_objects:\n",
461 |     "        print(err)\n",
462 |     "else:\n",
463 |     "    print(\"Import complete with no errors\")"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "markdown",
468 |    "metadata": {},
469 |    "source": [
470 |     "## Bonus - iterate through all collection data\n",
471 |     "\n",
472 |     "The client has a built-in function that allows you to iterate through all collection data."
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": null,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "wiki = client.collections.use(\"Wiki\")\n",
482 |     "\n",
483 |     "counter = 100\n",
484 |     "\n",
485 |     "for item in wiki.iterator():\n",
486 |     "    print(item.properties)\n",
487 |     "\n",
488 |     "    if (counter == 0): break\n",
489 |     "    \n",
490 |     "    counter -= 1"
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "markdown",
495 |    "metadata": {},
496 |    "source": [
497 |     "You can also retrieve the vector embeddings by passing `include_vector=True`."
498 |    ]
499 |   },
500 |   {
501 |    "cell_type": "code",
502 |    "execution_count": null,
503 |    "metadata": {},
504 |    "outputs": [],
505 |    "source": [
506 |     "counter = 10\n",
507 |     "\n",
508 |     "for item in wiki.iterator(include_vector=True):\n",
509 |     "    print(item.properties)\n",
510 |     "    print(item.vector)\n",
511 |     "\n",
512 |     "    if (counter == 0): break\n",
513 |     "    \n",
514 |     "    counter -= 1"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "markdown",
519 |    "metadata": {},
520 |    "source": [
521 |     "## Close the client"
522 |    ]
523 |   },
524 |   {
525 |    "cell_type": "code",
526 |    "execution_count": null,
527 |    "metadata": {},
528 |    "outputs": [],
529 |    "source": [
530 |     "client.close()"
531 |    ]
532 |   }
533 |  ],
534 |  "metadata": {
535 |   "kernelspec": {
536 |    "display_name": ".venv (3.11.9)",
537 |    "language": "python",
538 |    "name": "python3"
539 |   },
540 |   "language_info": {
541 |    "codemirror_mode": {
542 |     "name": "ipython",
543 |     "version": 3
544 |    },
545 |    "file_extension": ".py",
546 |    "mimetype": "text/x-python",
547 |    "name": "python",
548 |    "nbconvert_exporter": "python",
549 |    "pygments_lexer": "ipython3",
550 |    "version": "3.11.9"
551 |   }
552 |  },
553 |  "nbformat": 4,
554 |  "nbformat_minor": 2
555 | }
556 | 


--------------------------------------------------------------------------------
/1-intro/1-load-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Collection setup and data load"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## Get keys and urls"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import os\n",
 24 |     "from dotenv import load_dotenv\n",
 25 |     "\n",
 26 |     "load_dotenv()\n",
 27 |     "\n",
 28 |     "WEAVIATE_URL = os.getenv(\"WEAVIATE_URL\")\n",
 29 |     "WEAVIATE_KEY = os.getenv(\"WEAVIATE_KEY\")\n",
 30 |     "\n",
 31 |     "print(WEAVIATE_URL[:10])\n",
 32 |     "print(WEAVIATE_KEY[:10])\n",
 33 |     "\n",
 34 |     "if(WEAVIATE_URL == \"UPDATE_ME_WEAVIATE_URL\"):\n",
 35 |     "    raise Exception(\"Please update .env and Restart the notebook (see Restart button, next to Run All)\")"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "## Connect to Weaviate\n",
 43 |     "\n",
 44 |     "You need to pass in your Weaviate Cloud URL and KEY."
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": null,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "import weaviate\n",
 54 |     "from weaviate.classes.init import Auth\n",
 55 |     "# from weaviate.classes.init import AdditionalConfig, Timeout\n",
 56 |     "\n",
 57 |     "client = weaviate.connect_to_weaviate_cloud(\n",
 58 |     "    cluster_url=WEAVIATE_URL,\n",
 59 |     "    auth_credentials=Auth.api_key(WEAVIATE_KEY),\n",
 60 |     "\n",
 61 |     "    # additional_config=AdditionalConfig(\n",
 62 |     "    #     timeout=Timeout(init=2, query=45, insert=120),  # Values in seconds\n",
 63 |     "    # )\n",
 64 |     ")\n",
 65 |     "\n",
 66 |     "client.is_ready()"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "## Create a collection with a vectorizer\n",
 74 |     "\n",
 75 |     "* [Weaviate Docs - collection creation and configuration](https://weaviate.io/developers/weaviate/manage-data/collections)\n",
 76 |     "* [Weaviate integrated embedding models](https://weaviate.io/developers/weaviate/model-providers/weaviate/embeddings)\n",
 77 |     "\n",
 78 |     "Examples of other embedding models:\n",
 79 |     "* [Cohere](https://weaviate.io/developers/weaviate/model-providers/cohere/embeddings)\n",
 80 |     "* [HuggingFace 🤗](https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings)\n",
 81 |     "* [Ollama (self-hosted)](https://weaviate.io/developers/weaviate/model-providers/ollama/embeddings)\n",
 82 |     "* [OpenAI](https://weaviate.io/developers/weaviate/model-providers/openai/embeddings)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from weaviate.classes.config import Configure\n",
 92 |     "\n",
 93 |     "if client.collections.exists(\"Jeopardy\"):\n",
 94 |     "    client.collections.delete(\"Jeopardy\")\n",
 95 |     "\n",
 96 |     "# Create a collection - with Weaviate vectorizer\n",
 97 |     "client.collections.create(\n",
 98 |     "    name=\"Jeopardy\",\n",
 99 |     "    # TODO: add text2vec_weaviate vectorizer - with:\n",
100 |     "    # * model - Snowflake/snowflake-arctic-embed-l-v2.0\n",
101 |     "    \n",
102 |     ")"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Import data\n",
110 |     "### Sample Data"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "import json\n",
120 |     "\n",
121 |     "with open(\"./jeopardy_tiny.json\") as file:\n",
122 |     "    data_10 = json.load(file)\n",
123 |     "\n",
124 |     "print(json.dumps(data_10[0:2], indent=2))"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "### Insert Many\n",
132 |     "\n",
133 |     "> `insert_many` is intended only for small sets of objects – the whole request must complete within the insert timeout.\n",
134 |     "\n",
135 |     "[Weaviate Docs - insert many](https://weaviate.io/developers/weaviate/manage-data/import)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# Insert data\n",
145 |     "\n",
146 |     "# TODO: get Jeopardy collection\n",
147 |     "# TODO: insert data_10"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "### Data preview"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "metadata": {},
161 |    "outputs": [],
162 |    "source": [
163 |     "# Show data preview\n",
164 |     "jeopardy = client.collections.use(\"Jeopardy\")\n",
165 |     "\n",
166 |     "# TODO: fetch 4 objects\n",
167 |     "# response = jeopardy\n",
168 |     "\n",
169 |     "for item in response.objects:\n",
170 |     "    print(item.uuid, item.properties)"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "# Show data preview - with vectors\n",
180 |     "response = jeopardy.query.fetch_objects(\n",
181 |     "    limit=4,\n",
182 |     "    # TODO: add include_vector=True\n",
183 |     ")\n",
184 |     "\n",
185 |     "for item in response.objects:\n",
186 |     "    print(item.properties)\n",
187 |     "    print(item.vector, '\\n')"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "markdown",
192 |    "metadata": {},
193 |    "source": [
194 |     "### Super quick query example"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "# TODO: add near text query, search for African animals with limit 2\n",
204 |     "# response = jeopardy.query\n",
205 |     "\n",
206 |     "for item in response.objects:\n",
207 |     "    print(item.properties)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "## A bit bigger example - 2k objects"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "markdown",
219 |    "metadata": {},
220 |    "source": [
221 |     "### Load data"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "import json\n",
231 |     "\n",
232 |     "with open(\"./wiki-2k.json\") as file:\n",
233 |     "    data_2k = json.load(file)\n",
234 |     "\n",
235 |     "print(json.dumps(data_2k[0:2], indent=2))"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "### Create a collection with Named Vectors and SourceProperties"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {},
249 |    "outputs": [],
250 |    "source": [
251 |     "from weaviate.classes.config import Configure, Property, DataType\n",
252 |     "\n",
253 |     "def create_wiki_collection():\n",
254 |     "    if client.collections.exists(\"Wiki\"):\n",
255 |     "        client.collections.delete(\"Wiki\")\n",
256 |     "\n",
257 |     "    # Create a collection here - with Weaviate vectorizer and define source properties\n",
258 |     "    client.collections.create(\n",
259 |     "        name=\"Wiki\",\n",
260 |     "\n",
261 |     "        vector_config=[\n",
262 |     "            # NOTE: we are using NamedVectors here\n",
263 |     "            Configure.Vectors.text2vec_weaviate(\n",
264 |     "                name=\"main_vector\",\n",
265 |     "                model=\"Snowflake/snowflake-arctic-embed-l-v2.0\",\n",
266 |     "\n",
267 |     "                # TODO: set source properties to \"title\" and \"text\"\n",
268 |     "                # source_properties=[] # which properties should be used to generate a vector\n",
269 |     "            )\n",
270 |     "        ],\n",
271 |     "\n",
272 |     "        # Example: how to define property schema (Optional)\n",
273 |     "        # properties=[  \n",
274 |     "        #     Property(name=\"title\", data_type=DataType.TEXT),\n",
275 |     "        #     Property(name=\"text\", data_type=DataType.TEXT),\n",
276 |     "        #     Property(name=\"url\", data_type=DataType.TEXT),\n",
277 |     "        #     Property(name=\"wiki_id\", data_type=DataType.TEXT),\n",
278 |     "        # ],\n",
279 |     "    )\n",
280 |     "\n",
281 |     "create_wiki_collection()"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "### Import data - 2k objects with Batch\n",
289 |     "\n",
290 |     "Batching speeds up the import process by sending objects to Weaviate in groups instead of one at a time.\n",
291 |     "\n",
292 |     "Batch creates an internal buffer to collect objects to be added.<br>\n",
293 |     "Each time the buffer count reaches `batch_size`, batch sends the new objects to Weaviate.\n",
294 |     "\n",
295 |     "Types of batch:\n",
296 |     "* `dynamic` - let batch calculate the optimal batch_size based on detected latency\n",
297 |     "* `fixed_size` - provide a fixed batch_size\n",
298 |     "* `rate_limit` - limit the number of requests (per minute), useful for working with models with a rate limit"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "markdown",
303 |    "metadata": {},
304 |    "source": [
305 |     "### Take 1 – import sample 100"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "from tqdm import tqdm\n",
315 |     "\n",
316 |     "sample_100 = data_2k[0:100]\n",
317 |     "\n",
318 |     "wiki = client.collections.use(\"Wiki\")\n",
319 |     "\n",
320 |     "# TODO: setup dynamic batch\n",
321 |     "# loop through the sample_100 data\n",
322 |     "# add each object to the batch\n",
323 |     "\n",
324 |     "print(f\"Wiki count: {len(wiki)}\")"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": null,
330 |    "metadata": {},
331 |    "outputs": [],
332 |    "source": [
333 |     "# check for errors\n",
334 |     "if(len(wiki.batch.failed_objects)>0):\n",
335 |     "    print(\"Import complete with errors\")\n",
336 |     "    for err in wiki.batch.failed_objects:\n",
337 |     "        print(err)\n",
338 |     "else:\n",
339 |     "    print(\"Import complete with no errors\")"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "markdown",
344 |    "metadata": {},
345 |    "source": [
346 |     "### Take 2 – import sample 100 – with UUID\n",
347 |     "\n",
348 |     "To avoid inserting duplicates, you can generate a UUID based on the whole object or a unique property."
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": null,
354 |    "metadata": {},
355 |    "outputs": [],
356 |    "source": [
357 |     "from weaviate.util import generate_uuid5\n",
358 |     "\n",
359 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
360 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
361 |     "print(generate_uuid5(\"This UUID is always the same\"))\n",
362 |     "print(\"====================================\")\n",
363 |     "\n",
364 |     "print(generate_uuid5(\"This UUID is different\"))\n",
365 |     "print(generate_uuid5(\"This UUID is different\"))\n",
366 |     "print(\"====================================\")\n",
367 |     "\n",
368 |     "obj1 = { \"title\": \"this is an object\", \"count\": 1 }\n",
369 |     "obj2 = { \"title\": \"this is an object\", \"count\": 2 }\n",
370 |     "print(generate_uuid5(obj1))\n",
371 |     "print(generate_uuid5(obj2))\n"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": [
380 |     "# recreate the collection to start again\n",
381 |     "create_wiki_collection()"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "markdown",
386 |    "metadata": {},
387 |    "source": [
388 |     "> Rerun the import script multiple times.\n",
389 |     "\n",
390 |     "> Starting from the second run, the script should finish a lot faster, and the wiki count shouldn't increase."
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "from tqdm import tqdm\n",
400 |     "from weaviate.util import generate_uuid5\n",
401 |     "\n",
402 |     "sample_100 = data_2k[0:100]\n",
403 |     "\n",
404 |     "wiki = client.collections.use(\"Wiki\")\n",
405 |     "\n",
406 |     "with wiki.batch.fixed_size(batch_size=50, concurrent_requests=2) as batch:\n",
407 |     "    for item in tqdm(sample_100):\n",
408 |     "        # TODO: generate an id from item[\"wiki_id\"]\n",
409 |     "        # id = \n",
410 |     "\n",
411 |     "        batch.add_object(\n",
412 |     "            item,\n",
413 |     "            # TODO: provide the new id here \n",
414 |     "            # uuid=\n",
415 |     "        )\n",
416 |     "\n",
417 |     "print(f\"Wiki count: {len(wiki)}\")"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "markdown",
422 |    "metadata": {},
423 |    "source": [
424 |     "### Take 3 – import the rest of the data – but stop if there are too many errors"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": null,
430 |    "metadata": {},
431 |    "outputs": [],
432 |    "source": [
433 |     "from tqdm import tqdm\n",
434 |     "from weaviate.util import generate_uuid5\n",
435 |     "\n",
436 |     "wiki = client.collections.use(\"Wiki\")\n",
437 |     "\n",
438 |     "with wiki.batch.fixed_size(batch_size=600, concurrent_requests=2) as batch:\n",
439 |     "    for item in tqdm(data_2k):\n",
440 |     "        id = generate_uuid5(item[\"wiki_id\"])\n",
441 |     "        batch.add_object(item, uuid=id)\n",
442 |     "\n",
443 |     "        # Check number of errors while running\n",
444 |     "        if(batch.number_errors > 10):\n",
445 |     "            print(\"Errors during batch import\")\n",
446 |     "            break"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "markdown",
451 |    "metadata": {},
452 |    "source": [
453 |     "### Check for errors"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": null,
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": [
462 |     "if(len(wiki.batch.failed_objects)>0):\n",
463 |     "    print(\"Import complete with errors\")\n",
464 |     "    for err in wiki.batch.failed_objects:\n",
465 |     "        print(err)\n",
466 |     "else:\n",
467 |     "    print(\"Import complete with no errors\")"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "markdown",
472 |    "metadata": {},
473 |    "source": [
474 |     "## Bonus - iterate through all collection data\n",
475 |     "\n",
476 |     "The client has a built-in function that allows you to iterate through all collection data."
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": null,
482 |    "metadata": {},
483 |    "outputs": [],
484 |    "source": [
485 |     "wiki = client.collections.use(\"Wiki\")\n",
486 |     "\n",
487 |     "counter = 100\n",
488 |     "\n",
489 |     "for item in wiki.iterator():\n",
490 |     "    print(item.properties)\n",
491 |     "\n",
492 |     "    if (counter == 0): break\n",
493 |     "    \n",
494 |     "    counter -= 1"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "markdown",
499 |    "metadata": {},
500 |    "source": [
501 |     "You can also retrieve the vector embeddings by passing `include_vector=True`."
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": null,
507 |    "metadata": {},
508 |    "outputs": [],
509 |    "source": [
510 |     "counter = 10\n",
511 |     "\n",
512 |     "for item in wiki.iterator(include_vector=True):\n",
513 |     "    print(item.properties)\n",
514 |     "    print(item.vector)\n",
515 |     "\n",
516 |     "    if (counter == 0): break\n",
517 |     "    \n",
518 |     "    counter -= 1"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "markdown",
523 |    "metadata": {},
524 |    "source": [
525 |     "## Close the client"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "code",
530 |    "execution_count": null,
531 |    "metadata": {},
532 |    "outputs": [],
533 |    "source": [
534 |     "client.close()"
535 |    ]
536 |   }
537 |  ],
538 |  "metadata": {
539 |   "kernelspec": {
540 |    "display_name": ".venv (3.11.9)",
541 |    "language": "python",
542 |    "name": "python3"
543 |   },
544 |   "language_info": {
545 |    "codemirror_mode": {
546 |     "name": "ipython",
547 |     "version": 3
548 |    },
549 |    "file_extension": ".py",
550 |    "mimetype": "text/x-python",
551 |    "name": "python",
552 |    "nbconvert_exporter": "python",
553 |    "pygments_lexer": "ipython3",
554 |    "version": "3.11.9"
555 |   }
556 |  },
557 |  "nbformat": 4,
558 |  "nbformat_minor": 2
559 | }
560 | 


--------------------------------------------------------------------------------