├── .env.sample ├── .gitignore ├── LICENSE ├── README.md ├── notebooks ├── 1- Setup Hello World.ipynb ├── 10 - Django to LlamaIndex - Part 1.ipynb ├── 11 - Django to LlamaIndex - Part 2 - Creating Documents.ipynb ├── 12 - Django to LlamaIndex - Part 3 - Custom Emeddings.ipynb ├── 13 - Llama Index Semantic Search Modules.ipynb ├── 14 - Text to SQL with Llama Index.ipynb ├── 15 - Creating Page Views for Blog Posts.ipynb ├── 16 - Multiple Models Text to SQL with Llama Index.ipynb ├── 17 - Customize Prompts.ipynb ├── 18 - Talk to Django.ipynb ├── 2 - Getting Started with Embeddings and Comparison.ipynb ├── 3 - Embeddings with Multiple Data Points.ipynb ├── 4 - Embeddings with IDs.ipynb ├── 5 - Connecting Django.ipynb ├── 6 - Semantic Search with Django and pgvector.ipynb ├── 7 - Semantic Search with Generic Foreign Keys Across Multiple Models.ipynb ├── 8 - Services for Search.ipynb ├── 9 - Cosine Similarity with Numpy.ipynb ├── 99 - Demo.ipynb └── setup.py ├── requirements.txt └── src ├── analytics ├── __init__.py ├── admin.py ├── apps.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ └── fake_traffic.py ├── migrations │ ├── 0001_initial.py │ └── __init__.py ├── models.py ├── tests.py └── views.py ├── blog ├── __init__.py ├── admin.py ├── apps.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ └── load_posts.py ├── migrations │ ├── 0001_initial.py │ ├── 0002_blogpost_embedding.py │ ├── 0003_blogpost_can_delete.py │ ├── 0004_blogpost__content.py │ └── __init__.py ├── models.py ├── services.py ├── tests.py └── views.py ├── cfehome ├── __init__.py ├── asgi.py ├── settings.py ├── urls.py └── wsgi.py ├── manage.py ├── products ├── __init__.py ├── admin.py ├── apps.py ├── migrations │ ├── 0001_initial.py │ └── __init__.py ├── models.py ├── tests.py └── views.py └── rag ├── __init__.py ├── db.py ├── embeddings.py ├── engines.py ├── patches.py ├── prompts.py ├── settings.py ├── sync.py └── updaters.py /.env.sample: 
-------------------------------------------------------------------------------- 1 | DJANGO_DEBUG=1 2 | DATABASE_URL="" 3 | OPENAI_API_KEY="" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to 
pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 113 | .pdm.toml 114 | .pdm-python 115 | .pdm-build/ 116 | 117 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | #.idea/ 166 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Coding For Entrepreneurs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Talk to Django 2 | Talk to Django is simply a project that allows humans to make advanced queries to your Django project. In this project, you will learn Semantic Search, Text to SQL, and Retrieval-Augmented Generation (aka RAG) with Django, Embeddings, Sentence Transformers, Neon Postgres Vector, Llama Index, OpenAI, Ollama, Llama 3.1, and more. 3 | -------------------------------------------------------------------------------- /notebooks/1- Setup Hello World.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f38bcb2a-d951-4b34-9978-0f84eef2930e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", 14 | " from tqdm.autonotebook import tqdm, trange\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from sentence_transformers import SentenceTransformer\n", 20 | "\n", 21 | "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "cb69288f-b5f4-4285-823f-052bf401135e", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import numpy as np" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "b2e6aea1-68f6-49cd-855a-31345d1e058b", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "92838f97-feac-40b0-ab00-d5f84d9aed17", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.11.8" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 5 72 | } 73 | -------------------------------------------------------------------------------- /notebooks/10 - Django to LlamaIndex - Part 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2b5f7338-de48-48b2-a2ce-956150fabd56", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "1e3a536a-02b6-4243-97ba-751815314c37", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from 
decouple import config\n", 23 | "from blog.models import BlogPost \n", 24 | "from blog import services" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "id": "b66c90da-a020-4ab6-8f77-7b2797d17e68", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# qs = BlogPost.objects.filter(can_delete=True)\n", 35 | "# qs" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "id": "7436777e-4817-4f18-84ec-1e6975000e0c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# !pip install llama-index sqlalchemy llama-index-vector-stores-postgres" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "id": "a9f2d465-9e22-40ff-b565-3a06e4c00f61", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from llama_index.llms.openai import OpenAI\n", 56 | "from llama_index.embeddings.openai import OpenAIEmbedding" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "id": "e3f4d9bd-d4d2-41ab-b024-d8c8654f63c7", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "LLM_MODEL = config(\"LLM_MODEL\", default=\"gpt-4o\") # not in use use\n", 67 | "EMEDDING_LENGTH = config(\"EMEDDING_LENGTH\", default=1536, cast=int)\n", 68 | "EMEDDING_MODEL =config(\"EMEDDING_MODEL\", default=\"text-embedding-3-small\")\n", 69 | "OPENAI_API_KEY = config(\"OPENAI_API_KEY\")\n", 70 | "\n", 71 | "llm = OpenAI(model=LLM_MODEL, api_key=OPENAI_API_KEY)\n", 72 | "embed_model = OpenAIEmbedding(model=EMEDDING_MODEL, api_key=OPENAI_API_KEY)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 7, 78 | "id": "cd88f24c-5b04-46ec-b431-eafbbcedad6b", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from llama_index.core import Settings\n", 83 | "\n", 84 | "Settings.llm = llm\n", 85 | "Settings.embed_model = embed_model" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "id": 
"5db2ae42-5719-4213-8336-91319d9d013b", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "vector_db_name = \"vector_db\"\n", 96 | "vector_db_table_name = \"blogpost\" # -> data_blogpost" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 9, 102 | "id": "466f8d1d-ee71-4c46-96ba-823ccaeb4bc8", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "DATABASE_URL = config(\"DATABASE_URL_POOL\")\n", 107 | "if DATABASE_URL.startswith(\"postgres://\"):\n", 108 | " DATABASE_URL = DATABASE_URL.replace(\"postgres://\", \"postgresql://\", 1)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 12, 114 | "id": "7571ae4f-9d31-40f9-935a-c44eb8c1bb86", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# create a new database\n", 119 | "from sqlalchemy import create_engine, text\n", 120 | "\n", 121 | "engine = create_engine(DATABASE_URL, isolation_level=\"AUTOCOMMIT\")\n", 122 | "with engine.connect() as connection:\n", 123 | " result = connection.execute(text(\"SELECT 1 FROM pg_database WHERE datname = :db_name\"), {\"db_name\": vector_db_name})\n", 124 | " db_exists = result.scalar() == 1\n", 125 | " if not db_exists:\n", 126 | " session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))\n", 127 | " connection.execute(text(f\"CREATE DATABASE {vector_db_name}\"))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 18, 133 | "id": "2aaea3c8-d241-41c7-aa8a-ade68da9fda6", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from sqlalchemy import make_url\n", 138 | "from llama_index.vector_stores.postgres import PGVectorStore\n", 139 | "\n", 140 | "url = make_url(DATABASE_URL)\n", 141 | "vector_store = PGVectorStore.from_params(\n", 142 | " database=vector_db_name,\n", 143 | " host=url.host,\n", 144 | " password=url.password,\n", 145 | " port=url.port or 5432,\n", 146 | " user=url.username,\n", 147 | " table_name=vector_db_table_name,\n", 148 | " 
embed_dim=EMEDDING_LENGTH,\n", 149 | ")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 19, 155 | "id": "69d03215-8be1-4c37-87a9-8e9d2adb5d11", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from llama_index.core import VectorStoreIndex, StorageContext\n", 160 | "\n", 161 | "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", 162 | "index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)\n", 163 | "query_engine = index.as_query_engine()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 20, 169 | "id": "fd4da322-844c-4d33-b389-c2af636a8c18", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "Response(response='Empty Response', source_nodes=[], metadata=None)" 176 | ] 177 | }, 178 | "execution_count": 20, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "query_engine.query(\"My query\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "dcc9e537-494b-4ef7-94ad-ba1460341e64", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [] 194 | } 195 | ], 196 | "metadata": { 197 | "kernelspec": { 198 | "display_name": "Python 3 (ipykernel)", 199 | "language": "python", 200 | "name": "python3" 201 | }, 202 | "language_info": { 203 | "codemirror_mode": { 204 | "name": "ipython", 205 | "version": 3 206 | }, 207 | "file_extension": ".py", 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "nbconvert_exporter": "python", 211 | "pygments_lexer": "ipython3", 212 | "version": "3.11.8" 213 | } 214 | }, 215 | "nbformat": 4, 216 | "nbformat_minor": 5 217 | } 218 | -------------------------------------------------------------------------------- /notebooks/12 - Django to LlamaIndex - Part 3 - Custom Emeddings.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2b5f7338-de48-48b2-a2ce-956150fabd56", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "1e3a536a-02b6-4243-97ba-751815314c37", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from decouple import config\n", 23 | "from blog.models import BlogPost\n", 24 | "from blog import services" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "id": "b66c90da-a020-4ab6-8f77-7b2797d17e68", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# qs = BlogPost.objects.filter(can_delete=True)\n", 35 | "# qs" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "id": "7436777e-4817-4f18-84ec-1e6975000e0c", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# !pip install llama-index sqlalchemy llama-index-vector-stores-postgres" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "id": "a9f2d465-9e22-40ff-b565-3a06e4c00f61", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from llama_index.llms.openai import OpenAI\n", 56 | "from llama_index.embeddings.openai import OpenAIEmbedding" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 16, 62 | "id": "e3f4d9bd-d4d2-41ab-b024-d8c8654f63c7", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "LLM_MODEL = config(\"LLM_MODEL\", default=\"gpt-4o\") # not in use use\n", 67 | "EMEDDING_LENGTH = config(\"EMEDDING_LENGTH\", default=1536, cast=int)\n", 68 | "EMEDDING_MODEL =config(\"EMEDDING_MODEL\", default=\"text-embedding-3-small\")\n", 69 | "OPENAI_API_KEY = config(\"OPENAI_API_KEY\")\n", 70 | "\n", 71 | "llm = OpenAI(model=LLM_MODEL, 
api_key=OPENAI_API_KEY)\n", 72 | "embed_model = OpenAIEmbedding(model=EMEDDING_MODEL, api_key=OPENAI_API_KEY)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 17, 78 | "id": "50981128-db62-4a5c-9756-5c39b2da129b", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from typing import List\n", 83 | "\n", 84 | "class MyOpenAIEmbedding(OpenAIEmbedding):\n", 85 | " \n", 86 | " def _get_query_embedding(self, query: str) -> List[float]:\n", 87 | " \"\"\"Get query embedding.\"\"\"\n", 88 | " print('my query', query) \n", 89 | " # obj, created = Query.objects.get_or_create(query=query)\n", 90 | " # obj.get_query_embedding()\n", 91 | " return super()._get_query_embedding(query)\n", 92 | "\n", 93 | " def _get_text_embedding(self, text: str) -> List[float]:\n", 94 | " \"\"\"Get text embedding.\"\"\"\n", 95 | " print(\"texts\", text)\n", 96 | " return super()._get_text_embedding(text)\n", 97 | "\n", 98 | " def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:\n", 99 | " \"\"\"Get text embeddings.\n", 100 | "\n", 101 | " By default, this is a wrapper around _get_text_embedding.\n", 102 | " Can be overridden for batch queries.\n", 103 | " \"\"\"\n", 104 | " print(\"texts\", texts)\n", 105 | " return super()._get_text_embeddings(texts)\n", 106 | " \n", 107 | "embed_model = MyOpenAIEmbedding(model=EMEDDING_MODEL, api_key=OPENAI_API_KEY)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 41, 113 | "id": "cd88f24c-5b04-46ec-b431-eafbbcedad6b", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "from llama_index.core import Settings\n", 118 | "\n", 119 | "Settings.llm = llm\n", 120 | "Settings.embed_model = embed_model" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 12, 126 | "id": "5db2ae42-5719-4213-8336-91319d9d013b", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "vector_db_name = \"vector_db\"\n", 131 | "vector_db_table_name = 
\"blogpost\" # -> data_blogpost" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 13, 137 | "id": "466f8d1d-ee71-4c46-96ba-823ccaeb4bc8", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "DATABASE_URL = config(\"DATABASE_URL_POOL\")\n", 142 | "if DATABASE_URL.startswith(\"postgres://\"):\n", 143 | " DATABASE_URL = DATABASE_URL.replace(\"postgres://\", \"postgresql://\", 1)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 14, 149 | "id": "7571ae4f-9d31-40f9-935a-c44eb8c1bb86", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# create a new database\n", 154 | "from sqlalchemy import create_engine, text\n", 155 | "\n", 156 | "engine = create_engine(DATABASE_URL, isolation_level=\"AUTOCOMMIT\")\n", 157 | "with engine.connect() as connection:\n", 158 | " result = connection.execute(text(\"SELECT 1 FROM pg_database WHERE datname = :db_name\"), {\"db_name\": vector_db_name})\n", 159 | " db_exists = result.scalar() == 1\n", 160 | " if not db_exists:\n", 161 | " session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))\n", 162 | " connection.execute(text(f\"CREATE DATABASE {vector_db_name}\"))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 18, 168 | "id": "2aaea3c8-d241-41c7-aa8a-ade68da9fda6", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from sqlalchemy import make_url\n", 173 | "from llama_index.vector_stores.postgres import PGVectorStore\n", 174 | "\n", 175 | "url = make_url(DATABASE_URL)\n", 176 | "vector_store = PGVectorStore.from_params(\n", 177 | " database=vector_db_name,\n", 178 | " host=url.host,\n", 179 | " password=url.password,\n", 180 | " port=url.port or 5432,\n", 181 | " user=url.username,\n", 182 | " table_name=vector_db_table_name,\n", 183 | " embed_dim=EMEDDING_LENGTH,\n", 184 | ")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 46, 190 | "id": 
"69d03215-8be1-4c37-87a9-8e9d2adb5d11", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "from llama_index.core import VectorStoreIndex, StorageContext\n", 195 | "\n", 196 | "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", 197 | "index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)\n", 198 | "query_engine = index.as_query_engine()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "fd4da322-844c-4d33-b389-c2af636a8c18", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 52, 212 | "id": "dcc9e537-494b-4ef7-94ad-ba1460341e64", 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "from llama_index.core import Document\n", 217 | "\n", 218 | "docs = []\n", 219 | "qs = BlogPost.objects.filter(can_delete=True)\n", 220 | "for obj in qs:\n", 221 | " docs.append(\n", 222 | " Document(\n", 223 | " text=f\"{obj.get_embedding_text_raw()}\",\n", 224 | " doc_id=str(obj.id),\n", 225 | " embedding=obj.embedding.tolist(),\n", 226 | " metadata = {\n", 227 | " \"pk\": obj.pk,\n", 228 | " \"title\": obj.title\n", 229 | " }\n", 230 | " )\n", 231 | " )\n", 232 | "\n", 233 | "# docs" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 53, 239 | "id": "5890e396-f2f0-456e-8b12-48b8e06a7c95", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "for doc in docs:\n", 244 | " index.delete_ref_doc(f\"{doc.id_}\", delete_from_docstore=True)\n", 245 | " index.insert(doc)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 49, 251 | "id": "130ead94-e6a6-4566-b673-c185b8f9b7f8", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "my query The dog jumped\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "response = query_engine.query(\"The 
dog jumped\")" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 16, 269 | "id": "fff374aa-4331-4f48-96eb-cb8c0cb76cf8", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "pk 34\n", 277 | "title Blog Post 1\n", 278 | "pk 35\n", 279 | "title Blog Post 2\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "for k in response.metadata.keys():\n", 285 | " for subk, v in response.metadata[k].items():\n", 286 | " print(subk, v)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 19, 292 | "id": "dee1dd1a-1605-43ad-8d88-09c8a311082a", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "port = url.port or 5432\n", 297 | "db_url = f\"postgresql://{url.username}:{url.password}@{url.host}:{port}/{vector_db_name}\"\n", 298 | "\n", 299 | "\n", 300 | "from sqlalchemy import create_engine, text\n", 301 | "import numpy as np\n", 302 | "\n", 303 | "\n", 304 | "# Create the SQLAlchemy engine\n", 305 | "engine = create_engine(db_url)\n", 306 | "\n", 307 | "with engine.connect() as connection:\n", 308 | " # Define the SQL query to select only the id and embedding columns\n", 309 | " query = text(f\"SELECT * FROM data_{vector_db_table_name}\")\n", 310 | " query = text(f\"SELECT metadata_, embedding FROM data_{vector_db_table_name}\")\n", 311 | " \n", 312 | " # Execute the query\n", 313 | " result = connection.execute(query)\n", 314 | " \n", 315 | " # Fetch all rows\n", 316 | " rows = result.fetchall()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 20, 322 | "id": "51a8d6d2-aafe-41fa-8991-fbb6d63c8a18", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "def calculate_cosine_metrics(v1, v2):\n", 327 | " dot_product = np.dot(v1, v2)\n", 328 | " magnitude1 = np.linalg.norm(v1)\n", 329 | " magnitude2 = np.linalg.norm(v2)\n", 330 | " cosine_similarity = dot_product / (magnitude1 * magnitude2)\n", 331 | 
" cosine_distance = 1 - cosine_similarity\n", 332 | " return int(cosine_similarity* 100), int(cosine_distance * 100)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 22, 338 | "id": "fd1b03b6-64e6-414e-a192-32a4af3a2a38", 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "(100, 0)\n", 346 | "(100, 0)\n", 347 | "(100, 0)\n", 348 | "(100, 0)\n", 349 | "(100, 0)\n", 350 | "(100, 0)\n", 351 | "(100, 0)\n", 352 | "(100, 0)\n", 353 | "(100, 0)\n", 354 | "(100, 0)\n", 355 | "(100, 0)\n", 356 | "(100, 0)\n", 357 | "(100, 0)\n", 358 | "(100, 0)\n", 359 | "(100, 0)\n", 360 | "(100, 0)\n", 361 | "(100, 0)\n", 362 | "(100, 0)\n", 363 | "(100, 0)\n", 364 | "(100, 0)\n", 365 | "(100, 0)\n", 366 | "(100, 0)\n", 367 | "(100, 0)\n", 368 | "(100, 0)\n", 369 | "(100, 0)\n", 370 | "(100, 0)\n", 371 | "(100, 0)\n", 372 | "(100, 0)\n", 373 | "(100, 0)\n", 374 | "(100, 0)\n", 375 | "(100, 0)\n", 376 | "(100, 0)\n", 377 | "(100, 0)\n", 378 | "(100, 0)\n", 379 | "(100, 0)\n", 380 | "(100, 0)\n", 381 | "(100, 0)\n", 382 | "(100, 0)\n", 383 | "(100, 0)\n", 384 | "(100, 0)\n", 385 | "(100, 0)\n", 386 | "(100, 0)\n", 387 | "(100, 0)\n", 388 | "(100, 0)\n", 389 | "(100, 0)\n", 390 | "(100, 0)\n", 391 | "(100, 0)\n", 392 | "(100, 0)\n", 393 | "(100, 0)\n", 394 | "(100, 0)\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "for row in rows:\n", 400 | " metadata_, embedding = row[0], row[1]\n", 401 | " # print(metadata_)\n", 402 | " blog_post_pk = metadata_.get(\"pk\")\n", 403 | " # try:\n", 404 | " # obj = BlogPost.objects.get(pk=blog_post_pk)\n", 405 | " # except:\n", 406 | " # continue\n", 407 | " embedding_array = np.array(embedding.strip('[]').split(','), dtype=float)\n", 408 | " obj_embedding_array = np.array(obj.embedding, dtype=float)\n", 409 | " print(calculate_cosine_metrics(embedding_array.shape, obj_embedding_array.shape))\n", 410 | " # print(obj.embedding, embedding)" 411 | ] 412 
| }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "id": "cf34ef2a-c398-40bd-9dbd-89c9ca8cada9", 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [] 420 | } 421 | ], 422 | "metadata": { 423 | "kernelspec": { 424 | "display_name": "Python 3 (ipykernel)", 425 | "language": "python", 426 | "name": "python3" 427 | }, 428 | "language_info": { 429 | "codemirror_mode": { 430 | "name": "ipython", 431 | "version": 3 432 | }, 433 | "file_extension": ".py", 434 | "mimetype": "text/x-python", 435 | "name": "python", 436 | "nbconvert_exporter": "python", 437 | "pygments_lexer": "ipython3", 438 | "version": "3.11.8" 439 | } 440 | }, 441 | "nbformat": 4, 442 | "nbformat_minor": 5 443 | } 444 | -------------------------------------------------------------------------------- /notebooks/13 - Llama Index Semantic Search Modules.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c5ddf7b8-af88-4ce2-9e93-3c3a1e8f5905", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "d6a815a3-2c20-4459-b857-6aa6b7391ae7", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Syncing 4 docs\n", 26 | "Sync done.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "from rag import engines, sync\n", 32 | "\n", 33 | "sync.sync_blog_docs()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "24f74377-4413-4f7e-a01d-eeb7597ee61d", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "query_engine = engines.get_semantic_query_engine()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "id": "ac911eb0-f9fe-4418-a2d8-c3a12d1c682a", 50 | "metadata": {}, 51 | "outputs": [ 
52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Hello! How can I assist you today?\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "response = query_engine.query(\"hi there\")\n", 62 | "print(response.response)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "id": "da1125b8-b8ef-48b4-b15d-8c7e2fd2b7df", 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Score 0.6115641593933149\n", 76 | "Contents The cat jumped over the dog\n", 77 | "Metadata {'pk': 35, 'title': 'Blog Post 2'}\n", 78 | "Score 0.6085933809634089\n", 79 | "Contents The dog jumped over the cat\n", 80 | "Metadata {'pk': 34, 'title': 'Blog Post 1'}\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "response = query_engine.query(\"Dog jumping\")\n", 86 | "nodes = response.source_nodes\n", 87 | "\n", 88 | "for node in response.source_nodes:\n", 89 | " print(\"Score\", node.score)\n", 90 | " print(\"Contents\", node.node.get_content())\n", 91 | " print(\"Metadata\", node.node.metadata)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "id": "712a2237-7740-4989-a41e-f25b73ae2952", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "query_retriever_engine = engines.get_semantic_query_retriever_engine(top_k=5)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 7, 107 | "id": "3ac5c76d-da82-4c7f-8fd9-94560d9b7ec5", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "ranked_response = query_retriever_engine.query(\"Dog jumping\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 8, 117 | "id": "93c22326-6ba9-4d07-a286-ecc3ee8e23c1", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Score 0.6115641593933149\n", 125 | "Contents The cat jumped over the dog\n", 126 | "Metadata {'pk': 35, 'title': 
'Blog Post 2'}\n", 127 | "Score 0.6085933809634089\n", 128 | "Contents The dog jumped over the cat\n", 129 | "Metadata {'pk': 34, 'title': 'Blog Post 1'}\n", 130 | "Score 0.3220669819156523\n", 131 | "Contents The cat is yellow and the dog is red\n", 132 | "Metadata {'pk': 37, 'title': 'Blog Post 4'}\n", 133 | "Score 0.15181733082636206\n", 134 | "Contents It is very warm today\n", 135 | "Metadata {'pk': 36, 'title': 'Blog Post 3'}\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "for node in ranked_response.source_nodes:\n", 141 | " print(\"Score\", node.score)\n", 142 | " print(\"Contents\", node.node.get_content())\n", 143 | " print(\"Metadata\", node.node.metadata)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "d1c1fb26-fe3a-4ee3-91e5-ba3e643f859d", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3 (ipykernel)", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.11.8" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } 177 | -------------------------------------------------------------------------------- /notebooks/14 - Text to SQL with Llama Index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "940f26d4-29c0-4488-a590-b38592563b89", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": 
"7de1120e-935f-425c-8b07-f844861f1942", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from decouple import config" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "47b39382-a884-49c0-9440-1dcd0b68ccb5", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from blog.models import BlogPost\n", 33 | "from rag import db as rag_db, settings as rag_settings" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "id": "cb95572d-9a8e-44bb-991c-a68db4a8bb4b", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from sqlalchemy import (\n", 44 | " create_engine,\n", 45 | " inspect,\n", 46 | ")\n", 47 | "\n", 48 | "from llama_index.core import SQLDatabase\n", 49 | "from llama_index.core.query_engine import NLSQLTableQueryEngine\n", 50 | "from llama_index.core.retrievers import NLSQLRetriever" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "id": "0cea0a40-a1b6-46b8-85fa-73f133403a5e", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# initialize default LlamaIndex settings\n", 61 | "rag_settings.init()\n", 62 | "# get pooled Neon database string from .env or env vars\n", 63 | "vector_database_url = rag_db.get_database_url(use_pooling=True)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 6, 69 | "id": "cd919039-8f8d-4a99-a5f0-2105b9534b50", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "engine = create_engine(vector_database_url)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "id": "39a2e483-dfc1-452f-a960-6dc65c6afbe3", 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "['django_migrations',\n", 86 | " 'django_content_type',\n", 87 | " 'auth_permission',\n", 88 | " 'auth_group',\n", 89 | " 'auth_group_permissions',\n", 90 | " 'auth_user',\n", 91 | " 'auth_user_groups',\n", 92 | " 'auth_user_user_permissions',\n", 93 
| " 'django_admin_log',\n", 94 | " 'django_session',\n", 95 | " 'blog_blogpost',\n", 96 | " 'products_embedding',\n", 97 | " 'products_product']" 98 | ] 99 | }, 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "inspect(engine).get_table_names()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "id": "d043d078-2b31-4737-a2e6-94633f000a56", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "tables = []\n", 117 | "models = [BlogPost]\n", 118 | "for model in models:\n", 119 | " table = model._meta.db_table\n", 120 | " tables.append(table)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 9, 126 | "id": "a078145e-165c-4fa2-9fb2-c28c6f9f6081", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "['blog_blogpost']" 133 | ] 134 | }, 135 | "execution_count": 9, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "tables" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "id": "fd31bc90-9f6c-4858-9cf0-617d95f628ec", 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stderr", 152 | "output_type": "stream", 153 | "text": [ 154 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/llama_index/core/utilities/sql_wrapper.py:110: SAWarning: Did not recognize type 'vector' of column 'embedding'\n", 155 | " self._metadata.reflect(\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "sql_database = SQLDatabase(engine, include_tables=tables)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 11, 166 | "id": "f395572c-a467-4dde-9378-6214018f2626", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "sql_query_engine = NLSQLTableQueryEngine(\n", 171 | " sql_database=sql_database,\n", 172 | " tables=tables,\n", 173 | ")" 174 | ] 175 | }, 
176 | { 177 | "cell_type": "code", 178 | "execution_count": 18, 179 | "id": "4bd53412-5601-4af3-b705-28f47f5ad691", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "Response(response='Your most recent blog post ID is 37.', source_nodes=[NodeWithScore(node=TextNode(id_='71c339cc-6c0a-4a8c-a872-021415e117e8', embedding=None, metadata={'sql_query': 'SELECT id FROM blog_blogpost ORDER BY timestamp DESC LIMIT 1;', 'result': [(37,)], 'col_keys': ['id']}, excluded_embed_metadata_keys=['sql_query', 'result', 'col_keys'], excluded_llm_metadata_keys=['sql_query', 'result', 'col_keys'], relationships={}, text='[(37,)]', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=None)], metadata={'71c339cc-6c0a-4a8c-a872-021415e117e8': {'sql_query': 'SELECT id FROM blog_blogpost ORDER BY timestamp DESC LIMIT 1;', 'result': [(37,)], 'col_keys': ['id']}, 'sql_query': 'SELECT id FROM blog_blogpost ORDER BY timestamp DESC LIMIT 1;', 'result': [(37,)], 'col_keys': ['id']})" 186 | ] 187 | }, 188 | "execution_count": 18, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "response = sql_query_engine.query(\"What is my most recent blog post id?\")\n", 195 | "response" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 19, 201 | "id": "14020123-622c-4b9f-b9a6-92333c965dd3", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "[(37,)]\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "for node in response.source_nodes:\n", 214 | " print(node.node.get_content())" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 26, 220 | "id": "14e80a4f-1f91-4278-975e-79b060b57e34", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "nl_sql_retriever 
= NLSQLRetriever(\n", 225 | " sql_database, tables=tables, return_raw=True\n", 226 | ")\n", 227 | "\n", 228 | "r = nl_sql_retriever.retrieve(\"What is my least most recent blog post?\")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 27, 234 | "id": "51590562-fe10-4ad4-a98f-bfcaa53ecf02", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "[NodeWithScore(node=TextNode(id_='ff5bb486-d8c8-4973-adf1-0c38bad839ad', embedding=None, metadata={'sql_query': 'SELECT id, title, timestamp FROM blog_blogpost ORDER BY timestamp ASC LIMIT 1;', 'result': [(29, 'New Blog Post', datetime.datetime(2024, 7, 31, 18, 19, 27, 623803, tzinfo=datetime.timezone.utc))], 'col_keys': ['id', 'title', 'timestamp']}, excluded_embed_metadata_keys=['sql_query', 'result', 'col_keys'], excluded_llm_metadata_keys=['sql_query', 'result', 'col_keys'], relationships={}, text=\"[(29, 'New Blog Post', datetime.datetime(2024, 7, 31, 18, 19, 27, 623803, tzinfo=datetime.timezone.utc))]\", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=None)]\n", 242 | "Node ID: ff5bb486-d8c8-4973-adf1-0c38bad839ad\n", 243 | "Text: [(29, 'New Blog Post', datetime.datetime(2024, 7, 31, 18, 19,\n", 244 | "27, 623803, tzinfo=datetime.timezone.utc))]\n", 245 | "Score: None\n", 246 | "\n", 247 | "{'sql_query': 'SELECT id, title, timestamp FROM blog_blogpost ORDER BY timestamp ASC LIMIT 1;', 'result': [(29, 'New Blog Post', datetime.datetime(2024, 7, 31, 18, 19, 27, 623803, tzinfo=datetime.timezone.utc))], 'col_keys': ['id', 'title', 'timestamp']}\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "print(r)\n", 253 | "for node in r:\n", 254 | " print(node)\n", 255 | " print(node.metadata)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": 
"90410ef3-2a54-4790-98e2-dbe6207310e3", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "217508cb-e6c9-4cfc-a3fa-ead86bc22d4f", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "Python 3 (ipykernel)", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.11.8" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 5 296 | } 297 | -------------------------------------------------------------------------------- /notebooks/15 - Creating Page Views for Blog Posts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "bd852dda-18ea-4aad-a127-4c5d9fcb44e4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7ac8239e-a4d4-40ac-b0b4-352d875db445", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import random\n", 23 | "from analytics.models import PageView\n", 24 | "from blog.models import BlogPost" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "id": "6878cda6-e5db-4e8f-9636-75714400f3a0", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "qs = BlogPost.objects.filter(can_delete=True)\n", 35 | "\n", 36 | "for obj in qs:\n", 37 | " rand_views = random.randint(500, 2500)\n", 38 | " for x in range(rand_views):\n", 39 | " PageView.objects.create(post=obj)" 40 
| ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "4f283055-f807-4e14-bf81-95d9318bcd4e", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# PageView.objects.bulk_create()" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "kernelspec": { 55 | "display_name": "Python 3 (ipykernel)", 56 | "language": "python", 57 | "name": "python3" 58 | }, 59 | "language_info": { 60 | "codemirror_mode": { 61 | "name": "ipython", 62 | "version": 3 63 | }, 64 | "file_extension": ".py", 65 | "mimetype": "text/x-python", 66 | "name": "python", 67 | "nbconvert_exporter": "python", 68 | "pygments_lexer": "ipython3", 69 | "version": "3.11.8" 70 | } 71 | }, 72 | "nbformat": 4, 73 | "nbformat_minor": 5 74 | } 75 | -------------------------------------------------------------------------------- /notebooks/16 - Multiple Models Text to SQL with Llama Index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "940f26d4-29c0-4488-a590-b38592563b89", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7de1120e-935f-425c-8b07-f844861f1942", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from decouple import config" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "id": "47b39382-a884-49c0-9440-1dcd0b68ccb5", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from analytics.models import PageView\n", 33 | "from blog.models import BlogPost\n", 34 | "from rag import db as rag_db, settings as rag_settings" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "id": "cb95572d-9a8e-44bb-991c-a68db4a8bb4b", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from sqlalchemy import (\n", 45 | 
" create_engine,\n", 46 | " inspect,\n", 47 | ")\n", 48 | "\n", 49 | "from llama_index.core import SQLDatabase\n", 50 | "from llama_index.core.query_engine import NLSQLTableQueryEngine\n", 51 | "from llama_index.core.retrievers import NLSQLRetriever" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "id": "0cea0a40-a1b6-46b8-85fa-73f133403a5e", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# initialize default LlamaIndex settings\n", 62 | "rag_settings.init()\n", 63 | "# get pooled Neon database string from .env or env vars\n", 64 | "vector_database_url = rag_db.get_database_url(use_pooling=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "id": "cd919039-8f8d-4a99-a5f0-2105b9534b50", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "engine = create_engine(vector_database_url)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "id": "39a2e483-dfc1-452f-a960-6dc65c6afbe3", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "['django_migrations',\n", 87 | " 'django_content_type',\n", 88 | " 'auth_permission',\n", 89 | " 'auth_group',\n", 90 | " 'auth_group_permissions',\n", 91 | " 'auth_user',\n", 92 | " 'auth_user_groups',\n", 93 | " 'auth_user_user_permissions',\n", 94 | " 'django_admin_log',\n", 95 | " 'django_session',\n", 96 | " 'blog_blogpost',\n", 97 | " 'products_embedding',\n", 98 | " 'products_product',\n", 99 | " 'analytics_pageview']" 100 | ] 101 | }, 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "inspect(engine).get_table_names()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "id": "d043d078-2b31-4737-a2e6-94633f000a56", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "tables = []\n", 119 | "models = [BlogPost, PageView]\n", 120 | "for model in 
models:\n", 121 | " table = model._meta.db_table\n", 122 | " tables.append(table)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 9, 128 | "id": "a078145e-165c-4fa2-9fb2-c28c6f9f6081", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "['blog_blogpost', 'analytics_pageview']" 135 | ] 136 | }, 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "tables" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 10, 149 | "id": "fd31bc90-9f6c-4858-9cf0-617d95f628ec", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stderr", 154 | "output_type": "stream", 155 | "text": [ 156 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/llama_index/core/utilities/sql_wrapper.py:110: SAWarning: Did not recognize type 'vector' of column 'embedding'\n", 157 | " self._metadata.reflect(\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "sql_database = SQLDatabase(engine, include_tables=tables)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 11, 168 | "id": "f395572c-a467-4dde-9378-6214018f2626", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "sql_query_engine = NLSQLTableQueryEngine(\n", 173 | " sql_database=sql_database,\n", 174 | " tables=tables,\n", 175 | ")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 12, 181 | "id": "4bd53412-5601-4af3-b705-28f47f5ad691", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stderr", 186 | "output_type": "stream", 187 | "text": [ 188 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/llama_index/core/utilities/sql_wrapper.py:167: SAWarning: Did not recognize type 'vector' of column 'embedding'\n", 189 | " for column in self._inspector.get_columns(table_name, schema=self._schema):\n" 190 | ] 191 | }, 192 | { 193 | "name": "stdout", 194 | 
"output_type": "stream", 195 | "text": [ 196 | "The blog post with the most views is titled \"Taking it very seriously,\" which has garnered a total of 2,484 views.\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "response = sql_query_engine.query(\"What blog post has the most views?\")\n", 202 | "print(str(response))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 15, 208 | "id": "14020123-622c-4b9f-b9a6-92333c965dd3", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "[('Blog Post 1', 1523)]\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "for node in response.source_nodes:\n", 221 | " print(node.node.get_content())" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 18, 227 | "id": "14e80a4f-1f91-4278-975e-79b060b57e34", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "nl_sql_retriever = NLSQLRetriever(\n", 232 | " sql_database, tables=tables, return_raw=True\n", 233 | ")\n", 234 | "\n", 235 | "r = nl_sql_retriever.retrieve(\"What is my least most viewed blog post?\")" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 19, 241 | "id": "51590562-fe10-4ad4-a98f-bfcaa53ecf02", 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "[NodeWithScore(node=TextNode(id_='df4d692b-8c86-4094-a258-4784d067b810', embedding=None, metadata={'sql_query': 'SELECT blog_blogpost.id, blog_blogpost.title, COUNT(analytics_pageview.id) AS view_count\\nFROM blog_blogpost\\nLEFT JOIN analytics_pageview ON blog_blogpost.id = analytics_pageview.post_id\\nGROUP BY blog_blogpost.id\\nORDER BY view_count ASC\\nLIMIT 1;', 'result': [(29, 'New Blog Post', 0)], 'col_keys': ['id', 'title', 'view_count']}, excluded_embed_metadata_keys=['sql_query', 'result', 'col_keys'], excluded_llm_metadata_keys=['sql_query', 'result', 'col_keys'], 
relationships={}, text=\"[(29, 'New Blog Post', 0)]\", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=None)]\n", 249 | "Node ID: df4d692b-8c86-4094-a258-4784d067b810\n", 250 | "Text: [(29, 'New Blog Post', 0)]\n", 251 | "Score: None\n", 252 | "\n", 253 | "{'sql_query': 'SELECT blog_blogpost.id, blog_blogpost.title, COUNT(analytics_pageview.id) AS view_count\\nFROM blog_blogpost\\nLEFT JOIN analytics_pageview ON blog_blogpost.id = analytics_pageview.post_id\\nGROUP BY blog_blogpost.id\\nORDER BY view_count ASC\\nLIMIT 1;', 'result': [(29, 'New Blog Post', 0)], 'col_keys': ['id', 'title', 'view_count']}\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "print(r)\n", 259 | "for node in r:\n", 260 | " print(node)\n", 261 | " print(node.metadata)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "90410ef3-2a54-4790-98e2-dbe6207310e3", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "217508cb-e6c9-4cfc-a3fa-ead86bc22d4f", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3 (ipykernel)", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.11.8" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /notebooks/17 - Customize Prompts.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "940f26d4-29c0-4488-a590-b38592563b89", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "7de1120e-935f-425c-8b07-f844861f1942", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from decouple import config" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "47b39382-a884-49c0-9440-1dcd0b68ccb5", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from rag import db as rag_db, settings as rag_settings, engines as rag_engines" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "f30419da-7e57-4210-857d-b359767e9cc1", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "rag_settings.init()\n", 43 | "semantic_query_engine = rag_engines.get_semantic_query_engine()\n", 44 | "sql_query_engine = rag_engines.get_sql_query_engine(sql_only=True)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "1f0058f5-0598-4dfa-9ff0-fd2c60fbe9f1", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "print_default_prompts = True" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "577a4dc1-5fcb-45e7-b74d-40e404c17318", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "prompts_dict = sql_query_engine.get_prompts()\n", 65 | "\n", 66 | "if print_default_prompts:\n", 67 | " print(prompts_dict['response_synthesis_prompt'].template)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "708feb74-d5c7-48e7-9b5f-96b91eb26114", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "if print_default_prompts:\n", 78 | " 
print(sql_query_engine.sql_retriever.get_prompts()['text_to_sql_prompt'].template)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "89f98fa3-4898-4cab-bcae-9b7a16f1cb34", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from llama_index.core import PromptTemplate" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "36a85106-f24c-4399-acaa-142f02693f43", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "custom_sql_prompt = PromptTemplate(\"\"\"Given an input question, create a precise {dialect} PostgreSQL query to answer it. Follow these guidelines:\n", 99 | "\n", 100 | "1. Use only tables and columns from the provided schema.\n", 101 | "2. Select only relevant columns, never all columns.\n", 102 | "3. Qualify column names with table names when necessary.\n", 103 | "4. Use appropriate JOINs, WHERE clauses, and aggregations.\n", 104 | "5. Order results to highlight the most pertinent information.\n", 105 | "6. Avoid querying non-existent columns or tables.\n", 106 | "7. Optimize the query for performance where possible.\n", 107 | "\n", 108 | "Your response should contain only the SQL query, without any additional explanation or formatting. 
Do not use markdown or prepend the query with the term `sql`.\n", 109 | "\n", 110 | "Schema:\n", 111 | "{schema}\n", 112 | "\n", 113 | "Question: {query_str}\n", 114 | "SQL Query:\n", 115 | "\"\"\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "f6f92429-977f-4171-9df2-5b719ed69ba8", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "custom_response_synthesis_prompt = PromptTemplate(\"\"\"Given an input question, synthesize a response from the query results.\n", 126 | "Query: {query_str}\n", 127 | "SQL: {sql_query}\n", 128 | "SQL Response: {context_str}\n", 129 | "Response: \n", 130 | "\"\"\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "9d168d4b-90cb-448f-9986-bd89fe46d249", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "custom_response_synthesis_prompt.template_vars" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "cab7d9b7-8bfc-49a7-9d55-4418166a485d", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "sql_query_engine = rag_engines.get_sql_query_engine(\n", 151 | " sql_only=False,\n", 152 | " synthesize_response=True\n", 153 | ")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "6ce8fc88-4ce0-40e4-a6a6-b213d42a45d9", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# print(sql_query_engine.sql_retriever.get_prompts()['text_to_sql_prompt'].template)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "4bd53412-5601-4af3-b705-28f47f5ad691", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "response = sql_query_engine.query(\"What blog post has the most views from today?\")\n", 174 | "print(str(response))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "14020123-622c-4b9f-b9a6-92333c965dd3", 
181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "for node in response.source_nodes:\n", 185 | " print(node.node.get_content())" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "90410ef3-2a54-4790-98e2-dbe6207310e3", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "217508cb-e6c9-4cfc-a3fa-ead86bc22d4f", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.8" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 5 226 | } 227 | -------------------------------------------------------------------------------- /notebooks/18 - Talk to Django.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8dd700e2-34e8-4c9a-803c-0270c0f7c9f8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "f5adc0e1-26f7-4430-9de7-0a2df550063c", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from rag import (\n", 23 | " db as rag_db, \n", 24 | " engines as rag_engines,\n", 25 | " settings as rag_settings, \n", 26 | " updaters as rag_updaters,\n", 27 | ")" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "ba98d441-cc40-441f-b695-4f950bcfc130", 
34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from typing import Optional, Union\n", 38 | "from sqlalchemy import create_engine, text" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "id": "bca05a98-dca8-4c61-be99-533eb9511039", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "rag_settings.init()\n", 49 | "rag_db.init_vector_db()\n", 50 | "rag_updaters.update_llama_index_documents(use_saved_embeddings=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "id": "d31ead22-ac7b-4174-b2cb-4086a6459473", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "vector_index = rag_engines.get_semantic_query_index()\n", 61 | "semantic_query_retriever = rag_engines.get_semantic_query_retriever_engine()\n", 62 | "sql_query_engine = rag_engines.get_sql_query_engine()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "id": "3a76d593-450d-48fc-bbb6-74e0d2298bca", 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "vector_db blogpost\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "print(rag_settings.VECTOR_DB_NAME, rag_settings.VECTOR_DB_TABLE_NAME)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "id": "872f70d5-fd38-4146-9eee-0f4ba554a8d7", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from llama_index.core.tools import QueryEngineTool\n", 91 | "\n", 92 | "vector_tool = QueryEngineTool.from_defaults(\n", 93 | " query_engine=semantic_query_retriever,\n", 94 | " description=(\n", 95 | " f\"Useful for answering semantic questions about different blog posts\"\n", 96 | " ),\n", 97 | ")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "id": "0f155a6a-4d60-40e4-b43e-2428a086c8d1", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "sql_tool = QueryEngineTool.from_defaults(\n", 108 | 
" query_engine=sql_query_engine,\n", 109 | " description=(\n", 110 | " \"Useful for translating a natural language query into a SQL query over\"\n", 111 | " \" a table containing: blog posts and page views for each blog post\"\n", 112 | " ),\n", 113 | ")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "id": "cb330033-935f-4f89-a238-32ddd18a9e79", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from typing import Any, Optional, Union\n", 124 | "\n", 125 | "\n", 126 | "from llama_index.core.query_engine import SQLAutoVectorQueryEngine\n", 127 | "from llama_index.core.query_engine.sql_vector_query_engine import *\n", 128 | "\n", 129 | "class MySQLAutoVectorQueryEngine(SQLAutoVectorQueryEngine):\n", 130 | " def __init__(\n", 131 | " self,\n", 132 | " sql_query_tool: QueryEngineTool,\n", 133 | " vector_query_tool: QueryEngineTool,\n", 134 | " selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,\n", 135 | " llm: Optional[LLM] = None,\n", 136 | " service_context: Optional[ServiceContext] = None,\n", 137 | " sql_vector_synthesis_prompt: Optional[BasePromptTemplate] = None,\n", 138 | " sql_augment_query_transform: Optional[SQLAugmentQueryTransform] = None,\n", 139 | " use_sql_vector_synthesis: bool = True,\n", 140 | " callback_manager: Optional[CallbackManager] = None,\n", 141 | " verbose: bool = True,\n", 142 | " ) -> None:\n", 143 | " \"\"\"Initialize params.\"\"\"\n", 144 | " # validate that the query engines are of the right type\n", 145 | " if not isinstance(\n", 146 | " sql_query_tool.query_engine,\n", 147 | " (BaseSQLTableQueryEngine, NLSQLTableQueryEngine),\n", 148 | " ):\n", 149 | " raise ValueError(\n", 150 | " \"sql_query_tool.query_engine must be an instance of \"\n", 151 | " \"BaseSQLTableQueryEngine or NLSQLTableQueryEngine\"\n", 152 | " )\n", 153 | " if not isinstance(vector_query_tool.query_engine, RetrieverQueryEngine):\n", 154 | " raise ValueError(\n", 155 | " 
\"vector_query_tool.query_engine must be an instance of \"\n", 156 | " \"RetrieverQueryEngine\"\n", 157 | " )\n", 158 | " # if not isinstance(\n", 159 | " # vector_query_tool.query_engine.retriever, VectorIndexAutoRetriever\n", 160 | " # ):\n", 161 | " # raise ValueError(\n", 162 | " # \"vector_query_tool.query_engine.retriever must be an instance \"\n", 163 | " # \"of VectorIndexAutoRetriever\"\n", 164 | " # )\n", 165 | "\n", 166 | " sql_vector_synthesis_prompt = (\n", 167 | " sql_vector_synthesis_prompt or DEFAULT_SQL_VECTOR_SYNTHESIS_PROMPT\n", 168 | " )\n", 169 | " SQLJoinQueryEngine.__init__(\n", 170 | " self,\n", 171 | " sql_query_tool,\n", 172 | " vector_query_tool,\n", 173 | " selector=selector,\n", 174 | " llm=llm,\n", 175 | " service_context=service_context,\n", 176 | " sql_join_synthesis_prompt=sql_vector_synthesis_prompt,\n", 177 | " sql_augment_query_transform=sql_augment_query_transform,\n", 178 | " use_sql_join_synthesis=use_sql_vector_synthesis,\n", 179 | " callback_manager=callback_manager,\n", 180 | " verbose=verbose,\n", 181 | " )" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "id": "298422a3-bf77-4ade-a79d-e5f3dd69d172", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# from llama_index.core.query_engine import SQLAutoVectorQueryEngine\n", 192 | "\n", 193 | "query_engine = MySQLAutoVectorQueryEngine(\n", 194 | " sql_tool, \n", 195 | " vector_tool,\n", 196 | ")" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 12, 202 | "id": "50961604-48ca-437d-ad24-7ac7855cab96", 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "\u001b[1;3;34mQuerying other query engine: The question 'What kind of org is discussed?' is a semantic question about the content of blog posts. 
Therefore, choice (2) is the most relevant as it is useful for answering semantic questions about different blog posts.\n", 210 | "\u001b[0m" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "response = query_engine.query(\n", 216 | " \"What kind of org is discussed?\"\n", 217 | ")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 13, 223 | "id": "0af92a90-2914-4172-9c5d-8e3efbbb9c59", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "'The discussion contrasts two types of entities: an organization and an organism. An organization is structured, with systems, charts, and approval processes, while an organism is dynamic, constantly changing, and adapting to its environment. The text suggests that engaging with a culture as part of an organism can lead to better understanding and resilience compared to a traditional organization.'" 230 | ] 231 | }, 232 | "execution_count": 13, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "response.response" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 14, 244 | "id": "78548bec-2197-42f8-ab50-3d095d1374c0", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "\u001b[1;3;34mQuerying SQL database: The question requires translating a natural language query into a SQL query to retrieve the top 5 most viewed blog posts and analyze their content for keywords. 
Choice (1) is relevant as it deals with translating queries into SQL over a table containing blog posts and page views.\n", 252 | "\u001b[0m\u001b[1;3;33mSQL query: SELECT \n", 253 | " blog_blogpost.id, \n", 254 | " blog_blogpost.title, \n", 255 | " blog_blogpost.content, \n", 256 | " COUNT(analytics_pageview.id) AS view_count\n", 257 | "FROM \n", 258 | " blog_blogpost\n", 259 | "JOIN \n", 260 | " analytics_pageview ON blog_blogpost.id = analytics_pageview.post_id\n", 261 | "GROUP BY \n", 262 | " blog_blogpost.id, blog_blogpost.title, blog_blogpost.content\n", 263 | "ORDER BY \n", 264 | " view_count DESC\n", 265 | "LIMIT 5;\n", 266 | "\u001b[0m\u001b[1;3;33mSQL response: The top 5 most viewed blog posts and their content keywords are as follows:\n", 267 | "\n", 268 | "1. **Title:** Taking it very seriously\n", 269 | " - **View Count:** 2493\n", 270 | " - **Content Keywords:** April first, greeting, New Year’s, internet, smile, pretend, apocalypse\n", 271 | "\n", 272 | "2. **Title:** “But we were comfortable”\n", 273 | " - **View Count:** 2490\n", 274 | " - **Content Keywords:** shift to digital, unwanted, risk, lonely, powerful, efficient, comfortable, follow, stay\n", 275 | "\n", 276 | "3. **Title:** All models are wrong, some models are useful\n", 277 | " - **View Count:** 2471\n", 278 | " - **Content Keywords:** model, map, territory, approximation, problem, organization, opportunity, simplified version\n", 279 | "\n", 280 | "4. **Title:** The Fremen principle\n", 281 | " - **View Count:** 2391\n", 282 | " - **Content Keywords:** new resources, limited resources, population, alternatives, Harvard, distance learning, in-person lectures, tenure, accreditation, waiting list\n", 283 | "\n", 284 | "5. 
**Title:** Portfolio school: Get better clients\n", 285 | " - **View Count:** 2383\n", 286 | " - **Content Keywords:** tragedy, health, economy, panic, focus, overwhelmed, health care workers, burning the candle\n", 287 | "\n", 288 | "These blog posts cover a range of topics from digital transformation and resource management to health and economic challenges, each resonating with a significant number of readers.\n", 289 | "\u001b[0m\u001b[1;3;34mTransformed query given SQL response: None\n", 290 | "\u001b[0m" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "response = query_engine.query(\n", 296 | " \"Are are the top 5 most viewed blog posts? What keywords do their content have?\"\n", 297 | ")" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 15, 303 | "id": "96f78be4-960a-432c-98db-2c0af2078963", 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/markdown": [ 309 | "The top 5 most viewed blog posts and their content keywords are as follows:\n", 310 | "\n", 311 | "1. **Title:** Taking it very seriously\n", 312 | " - **View Count:** 2493\n", 313 | " - **Content Keywords:** April first, greeting, New Year’s, internet, smile, pretend, apocalypse\n", 314 | "\n", 315 | "2. **Title:** “But we were comfortable”\n", 316 | " - **View Count:** 2490\n", 317 | " - **Content Keywords:** shift to digital, unwanted, risk, lonely, powerful, efficient, comfortable, follow, stay\n", 318 | "\n", 319 | "3. **Title:** All models are wrong, some models are useful\n", 320 | " - **View Count:** 2471\n", 321 | " - **Content Keywords:** model, map, territory, approximation, problem, organization, opportunity, simplified version\n", 322 | "\n", 323 | "4. **Title:** The Fremen principle\n", 324 | " - **View Count:** 2391\n", 325 | " - **Content Keywords:** new resources, limited resources, population, alternatives, Harvard, distance learning, in-person lectures, tenure, accreditation, waiting list\n", 326 | "\n", 327 | "5. 
**Title:** Portfolio school: Get better clients\n", 328 | " - **View Count:** 2383\n", 329 | " - **Content Keywords:** tragedy, health, economy, panic, focus, overwhelmed, health care workers, burning the candle\n", 330 | "\n", 331 | "These blog posts cover a range of topics from digital transformation and resource management to health and economic challenges, each resonating with a significant number of readers." 332 | ], 333 | "text/plain": [ 334 | "" 335 | ] 336 | }, 337 | "metadata": {}, 338 | "output_type": "display_data" 339 | } 340 | ], 341 | "source": [ 342 | "from IPython.display import Markdown, display\n", 343 | "\n", 344 | "display(Markdown(response.response))" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 20, 350 | "id": "9787d4a9-901a-4fc3-a97d-d04fd15b9599", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "\u001b[1;3;34mQuerying SQL database: The question requires translating a natural language query into a SQL query to retrieve the top 5 least viewed blog posts from today. Choice (1) is relevant as it is useful for translating natural language queries into SQL queries over a table containing blog posts and page views.\n", 358 | "\u001b[0m\u001b[1;3;33mSQL query: SELECT bp.id, bp.title, COUNT(pv.id) AS view_count\n", 359 | "FROM blog_blogpost bp\n", 360 | "LEFT JOIN analytics_pageview pv ON bp.id = pv.post_id\n", 361 | "WHERE pv.timestamp::date = CURRENT_DATE\n", 362 | "GROUP BY bp.id, bp.title\n", 363 | "ORDER BY view_count ASC\n", 364 | "LIMIT 5;\n", 365 | "\u001b[0m\u001b[1;3;33mSQL response: Based on today's data, here are the top 5 least viewed blog posts:\n", 366 | "\n", 367 | "1. **Monopoly and network effects** - 525 views\n", 368 | "2. **A Sunday book reading** - 558 views\n", 369 | "3. **Helping leaders in college reboot** - 624 views\n", 370 | "4. **Is everything going to be okay?** - 648 views\n", 371 | "5. 
**You’re surrounded** - 654 views\n", 372 | "\n", 373 | "These posts have received the fewest views today.\n", 374 | "\u001b[0m\u001b[1;3;34mTransformed query given SQL response: None\n", 375 | "\u001b[0mBased on today's data, here are the top 5 least viewed blog posts:\n", 376 | "\n", 377 | "1. **Monopoly and network effects** - 525 views\n", 378 | "2. **A Sunday book reading** - 558 views\n", 379 | "3. **Helping leaders in college reboot** - 624 views\n", 380 | "4. **Is everything going to be okay?** - 648 views\n", 381 | "5. **You’re surrounded** - 654 views\n", 382 | "\n", 383 | "These posts have received the fewest views today.\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "response = query_engine.query(\n", 389 | " \"What are the top 5 least viewed blog posts from today?\"\n", 390 | ")\n", 391 | "print(response.response)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 22, 397 | "id": "90fac10c-1674-49b4-8f5a-73eb4aa43461", 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/markdown": [ 403 | "Based on today's data, here are the top 5 least viewed blog posts:\n", 404 | "\n", 405 | "1. **Monopoly and network effects** - 525 views\n", 406 | "2. **A Sunday book reading** - 558 views\n", 407 | "3. **Helping leaders in college reboot** - 624 views\n", 408 | "4. **Is everything going to be okay?** - 648 views\n", 409 | "5. **You’re surrounded** - 654 views\n", 410 | "\n", 411 | "These posts have received the fewest views today." 
412 | ], 413 | "text/plain": [ 414 | "" 415 | ] 416 | }, 417 | "metadata": {}, 418 | "output_type": "display_data" 419 | } 420 | ], 421 | "source": [ 422 | "display(Markdown(response.response))" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "d633c0de-04d7-4232-9151-5b081f7d3e50", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": { 435 | "kernelspec": { 436 | "display_name": "Python 3 (ipykernel)", 437 | "language": "python", 438 | "name": "python3" 439 | }, 440 | "language_info": { 441 | "codemirror_mode": { 442 | "name": "ipython", 443 | "version": 3 444 | }, 445 | "file_extension": ".py", 446 | "mimetype": "text/x-python", 447 | "name": "python", 448 | "nbconvert_exporter": "python", 449 | "pygments_lexer": "ipython3", 450 | "version": "3.11.8" 451 | } 452 | }, 453 | "nbformat": 4, 454 | "nbformat_minor": 5 455 | } 456 | -------------------------------------------------------------------------------- /notebooks/2 - Getting Started with Embeddings and Comparison.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f38bcb2a-d951-4b34-9978-0f84eef2930e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", 14 | " from tqdm.autonotebook import tqdm, trange\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "from sentence_transformers import SentenceTransformer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "b2e6aea1-68f6-49cd-855a-31345d1e058b", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "id": "7a7246cc-9adf-4183-a208-fde93f73e164", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "['The', 'cat', 'dog', 'jumped', 'over', 'the']" 43 | ] 44 | }, 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "split_docs = sorted([\"The dog jumped over the cat\"][0].split(' '))\n", 52 | "split_docs" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "id": "7f6ca091-f29a-4445-a168-4aaef829325d", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "['The dog jumped over the cat']" 65 | ] 66 | }, 67 | "execution_count": 7, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "docs = [\"The dog jumped over the cat\"]\n", 74 | "docs" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 9, 80 | "id": "8c92b811-c453-439c-b63e-b2df4edfe2a6", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "embeddings = model.encode(docs)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 11, 90 | "id": "228f4651-6298-4b74-8a32-22ced882e9de", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "docs2 = [\"The cat jumped over the dog\"]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 12, 100 | "id": "d41a784a-4bfa-4388-9396-6a2a064eb7a8", 101 | "metadata": {}, 102 | 
"outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "['The', 'cat', 'dog', 'jumped', 'over', 'the']" 107 | ] 108 | }, 109 | "execution_count": 12, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "split_docs2 = sorted(docs2[0].split(' '))\n", 116 | "split_docs2" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 13, 122 | "id": "af5b7a83-9997-4c60-abbe-8e274b7edf94", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "embeddings2 = model.encode(docs2)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 21, 132 | "id": "96e7db7b-635b-4653-8174-edcbdbaf6fd0", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "tensor([[1.0000]])" 139 | ] 140 | }, 141 | "execution_count": 21, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "model.similarity(embeddings,embeddings)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 16, 153 | "id": "943b41ed-4486-4e97-a741-3753a1c98dd7", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "tensor([[0.9895]])" 160 | ] 161 | }, 162 | "execution_count": 16, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "model.similarity(embeddings,embeddings2)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 19, 174 | "id": "ab5c2dba-7f33-4a99-8650-353840eabafd", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "embeddings3 = model.encode([\"The dog is hot from jumping\"])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 20, 184 | "id": "120dcaba-806b-4896-b495-5fb6af565839", 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "tensor([[0.4310]])" 191 | ] 192 | }, 193 | "execution_count": 20, 194 | 
"metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "model.similarity(embeddings,embeddings3)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 22, 205 | "id": "7766c6bf-837f-4d30-9dc7-7270cc61e906", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "tensor([[-0.0660]])" 212 | ] 213 | }, 214 | "execution_count": 22, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "embeddings4 = model.encode([\"The weather is hot\"])\n", 221 | "model.similarity(embeddings,embeddings4)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 23, 227 | "id": "87746496-8039-46a9-a1a5-3c7017dd1b86", 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "array([[ 4.72092927e-02, 3.36049646e-02, 6.90095648e-02,\n", 234 | " 6.85966387e-02, -1.26861297e-02, -6.25286764e-03,\n", 235 | " -3.72543260e-02, 1.55753568e-02, 2.41127852e-02,\n", 236 | " 2.21969979e-03, 7.17992783e-02, 2.92151421e-02,\n", 237 | " 5.03158709e-03, -6.37696981e-02, 4.89643309e-03,\n", 238 | " 1.07082445e-03, -1.23178743e-01, 9.51716863e-03,\n", 239 | " 5.84049039e-02, -1.34706022e-02, -2.93736849e-02,\n", 240 | " -4.55944873e-02, -1.06726373e-02, -4.11552712e-02,\n", 241 | " -3.63231786e-02, 3.73865589e-02, -8.08678716e-02,\n", 242 | " -6.92620501e-02, 1.39805004e-02, -1.62780806e-02,\n", 243 | " -2.12399829e-02, -8.98967776e-03, -3.35779712e-02,\n", 244 | " 5.97752631e-02, -6.10135272e-02, -7.63887540e-02,\n", 245 | " 6.19942881e-02, 3.30313668e-03, 3.82678136e-02,\n", 246 | " 1.23094983e-01, -3.98203246e-02, 7.74865597e-03,\n", 247 | " -1.22557981e-02, -2.10484420e-03, -3.81270237e-02,\n", 248 | " 6.25126734e-02, -1.30796088e-02, -6.36047423e-02,\n", 249 | " 3.17579880e-02, 5.67504615e-02, -1.24571053e-02,\n", 250 | " 9.28938538e-02, 1.28329648e-02, 4.26348625e-03,\n", 251 | " -3.06692068e-02, 
-2.12268997e-02, -1.05060926e-02,\n", 252 | " -1.06700696e-03, 9.03413631e-03, -9.17110816e-02,\n", 253 | " -3.65995839e-02, 5.63249737e-02, 1.30695961e-02,\n", 254 | " 7.12005049e-02, 2.41018347e-02, -4.02708817e-03,\n", 255 | " -2.95967516e-02, 8.73772800e-03, -3.27959545e-02,\n", 256 | " 7.57316276e-02, 6.74551427e-02, -5.97921833e-02,\n", 257 | " -2.18018293e-02, -4.76419851e-02, 8.67142444e-05,\n", 258 | " 1.69717856e-02, 4.07828987e-02, -1.44925695e-02,\n", 259 | " 2.73144431e-02, 3.89508046e-02, 2.99173929e-02,\n", 260 | " -4.69833985e-02, -2.68170014e-02, 8.42823740e-03,\n", 261 | " 5.28471991e-02, 6.88643530e-02, 1.32485619e-02,\n", 262 | " -1.13096333e-03, -4.26305979e-02, 5.93895698e-03,\n", 263 | " 8.36270601e-02, -7.49354437e-02, -2.25446802e-02,\n", 264 | " 2.79743504e-02, -6.21221916e-05, -1.44020608e-02,\n", 265 | " 1.41254440e-02, -3.81299518e-02, 4.55967039e-02,\n", 266 | " -1.70166066e-04, 1.67241730e-02, 1.98970921e-02,\n", 267 | " -4.46504988e-02, 3.20600648e-03, -1.03553981e-02,\n", 268 | " 5.18428497e-02, 2.58499328e-02, -1.82424746e-02,\n", 269 | " 3.90933678e-02, 4.49479138e-03, 1.81336678e-03,\n", 270 | " -4.20809016e-02, -2.64772563e-03, 2.49012001e-02,\n", 271 | " 4.76350356e-03, 2.95294691e-02, -1.00575397e-02,\n", 272 | " -7.25563169e-02, 4.65890467e-02, -1.00567088e-01,\n", 273 | " 1.10430168e-02, 3.41703705e-02, -2.87686270e-02,\n", 274 | " 3.63497995e-02, -8.35470930e-02, -5.64337615e-03,\n", 275 | " -3.21779735e-02, -6.29329333e-33, -4.08413773e-03,\n", 276 | " -6.72595724e-02, -1.14535823e-01, -8.60982314e-02,\n", 277 | " 2.95935180e-02, 2.69726384e-02, 4.59496416e-02,\n", 278 | " -3.08204978e-03, -1.23914972e-01, 6.18214114e-03,\n", 279 | " -1.08736321e-01, 2.45321598e-02, -3.44574712e-02,\n", 280 | " -6.73222616e-02, -4.37792949e-02, -1.23018515e-04,\n", 281 | " -7.90959522e-02, 2.48696283e-03, 1.44435475e-02,\n", 282 | " 5.67662530e-02, 2.45719776e-02, -1.87794561e-03,\n", 283 | " -6.87089702e-03, 3.22064459e-02, 
-7.62377447e-03,\n", 284 | " 3.58338617e-02, -8.37533101e-02, -7.97527377e-03,\n", 285 | " -7.58397020e-03, 5.47426613e-03, 3.07897162e-02,\n", 286 | " -7.25283055e-03, 3.01134698e-02, 8.13684389e-02,\n", 287 | " -1.01099722e-01, -2.73132091e-03, 6.38261205e-03,\n", 288 | " -4.21030186e-02, 6.91264542e-03, 5.66436164e-02,\n", 289 | " 3.23042274e-02, 3.59569602e-02, 3.09580676e-02,\n", 290 | " -4.45846580e-02, -2.08867416e-02, -3.62370797e-02,\n", 291 | " -5.14244772e-02, 5.91557182e-04, -5.85282631e-02,\n", 292 | " 4.29066643e-02, 9.78493411e-03, -3.97967696e-02,\n", 293 | " 3.03792730e-02, 2.88908388e-02, -6.26284629e-02,\n", 294 | " 3.13719846e-02, 7.99964648e-03, -3.94616760e-02,\n", 295 | " -4.05244716e-02, 1.53085679e-01, 2.59496439e-02,\n", 296 | " -5.89517364e-03, 9.26737785e-02, -3.77385467e-02,\n", 297 | " 1.05251536e-01, -9.34893787e-02, 1.95522085e-02,\n", 298 | " 1.82549562e-02, 2.28665713e-02, 3.25189978e-02,\n", 299 | " -6.00351989e-02, -2.13980880e-02, -1.39885722e-02,\n", 300 | " -2.26340350e-02, 1.07471868e-02, 1.63239613e-03,\n", 301 | " -1.05444759e-01, -1.04314154e-02, -2.98149306e-02,\n", 302 | " 1.97162647e-02, 1.02182806e-01, -5.51808961e-02,\n", 303 | " 1.12160996e-01, 4.55091968e-02, -8.66248552e-03,\n", 304 | " 9.44400504e-02, 1.07874526e-02, -4.32968810e-02,\n", 305 | " -7.35958740e-02, 4.46881317e-02, -5.76534644e-02,\n", 306 | " 3.13753523e-02, 5.50542101e-02, -5.84391542e-02,\n", 307 | " 7.15372637e-02, 3.72109018e-33, 1.58393886e-02,\n", 308 | " -6.40949011e-02, 4.42647822e-02, 1.83350332e-02,\n", 309 | " -1.41719088e-01, 2.54531950e-02, -4.73350193e-03,\n", 310 | " -4.48370948e-02, -5.70793860e-02, -1.83952954e-02,\n", 311 | " 4.54419479e-03, 3.00055519e-02, 9.58208293e-02,\n", 312 | " 1.51274819e-02, 5.74189983e-02, -3.56427091e-03,\n", 313 | " -5.65455072e-02, 1.41205946e-02, -7.86725897e-03,\n", 314 | " -1.85281485e-02, -7.67951608e-02, 3.52232456e-02,\n", 315 | " 3.77433486e-02, 6.88037053e-02, 1.03567079e-01,\n", 316 | " 
-1.87856965e-02, 4.22471538e-02, 2.04601754e-02,\n", 317 | " 1.42207175e-01, -1.70064643e-01, 8.77113864e-02,\n", 318 | " -9.17931646e-02, 1.76482927e-02, 3.39259878e-02,\n", 319 | " 4.80807945e-02, 7.87265748e-02, -1.78247243e-02,\n", 320 | " 3.90493013e-02, 4.34681028e-02, -3.24376412e-02,\n", 321 | " -9.82895959e-04, -2.58320663e-02, -9.11474526e-02,\n", 322 | " -1.98325440e-02, -1.88254472e-02, 1.21096885e-02,\n", 323 | " -4.97272462e-02, 8.03967938e-04, 1.00843143e-02,\n", 324 | " -4.19993028e-02, -8.24327096e-02, -8.95676315e-02,\n", 325 | " 1.09379835e-01, 2.12305058e-02, 3.33094001e-02,\n", 326 | " 5.06718345e-02, 1.20277844e-01, 8.50599911e-03,\n", 327 | " -1.20349936e-02, -1.97899248e-02, 2.11428516e-02,\n", 328 | " 5.40899634e-02, 7.98444673e-02, -8.20083730e-03,\n", 329 | " -7.38502443e-02, -1.08506912e-02, 1.69663448e-02,\n", 330 | " -1.82815045e-02, 1.01783648e-02, -9.47182029e-02,\n", 331 | " -1.03601953e-03, 7.16945231e-02, -1.56212896e-02,\n", 332 | " -2.91726086e-02, 8.48290278e-04, 6.66561797e-02,\n", 333 | " 8.25966075e-02, 3.73382606e-02, 3.72808948e-02,\n", 334 | " -8.93745571e-02, -2.57701706e-02, -3.89345810e-02,\n", 335 | " -3.50353494e-02, 5.20156734e-02, 2.95812450e-02,\n", 336 | " 2.24320926e-02, 1.80870746e-04, 6.74418136e-02,\n", 337 | " -4.30699177e-02, -5.22616357e-02, -2.33668163e-02,\n", 338 | " 2.24496797e-02, 2.82612368e-02, -4.13209908e-02,\n", 339 | " 2.69286102e-03, -1.54635593e-08, -6.67733252e-02,\n", 340 | " 6.34676358e-03, -9.27698240e-02, 2.62686461e-02,\n", 341 | " 8.18805173e-02, 5.19817658e-02, -4.02192399e-02,\n", 342 | " -8.42549875e-02, -5.93853481e-02, -5.99686764e-02,\n", 343 | " 2.73107737e-02, -7.36553818e-02, 2.12238133e-02,\n", 344 | " 3.83131281e-02, -2.64682695e-02, 1.61006663e-03,\n", 345 | " -4.24509645e-02, 5.99909760e-02, 8.10987130e-02,\n", 346 | " 2.12833017e-01, -9.78133157e-02, 2.45644674e-02,\n", 347 | " -9.40957516e-02, -4.70139757e-02, 2.56397054e-02,\n", 348 | " 4.24515232e-02, -5.95228113e-02, 
2.67450102e-02,\n", 349 | " 2.17074845e-02, -3.45471166e-02, -6.88105589e-03,\n", 350 | " -4.83707152e-02, -2.75955871e-02, -4.16629435e-03,\n", 351 | " 1.21461833e-02, 3.29857180e-03, 8.06904435e-02,\n", 352 | " -8.01259652e-02, -7.88580813e-03, -6.09368086e-02,\n", 353 | " 3.13197486e-02, 8.74250978e-02, 5.24751358e-02,\n", 354 | " -3.71866971e-02, 1.21963331e-02, 2.96359211e-02,\n", 355 | " 1.92116294e-02, -7.66872093e-02, 2.92952415e-02,\n", 356 | " 6.28225356e-02, -8.35423078e-03, 7.12382942e-02,\n", 357 | " 2.44259741e-02, -6.16908399e-03, -2.69643925e-02,\n", 358 | " -2.53030658e-02, -4.07200083e-02, -1.37236863e-02,\n", 359 | " 1.74375903e-02, 4.65289690e-02, 2.35924944e-02,\n", 360 | " -3.43679078e-03, -3.65919108e-03, 9.21025425e-02]], dtype=float32)" 361 | ] 362 | }, 363 | "execution_count": 23, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "embeddings2" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "4d3f1589-9ff8-4fd3-87ce-743ff4cd56b7", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3 (ipykernel)", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.11.8" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 5 402 | } 403 | -------------------------------------------------------------------------------- /notebooks/3 - Embeddings with Multiple Data Points.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": 
"f38bcb2a-d951-4b34-9978-0f84eef2930e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", 14 | " from tqdm.autonotebook import tqdm, trange\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "from sentence_transformers import SentenceTransformer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "b2e6aea1-68f6-49cd-855a-31345d1e058b", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 12, 36 | "id": "87bca28d-5202-49f2-ae6f-7c2461e243cd", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data = [1,2,3,4,5,6,7]\n", 41 | "query = 5.3 # 97% match to 5" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 14, 47 | "id": "f5fb3278-e22e-46c9-8287-1ec71886db8c", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# data.index(query)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 26, 57 | "id": "7f6ca091-f29a-4445-a168-4aaef829325d", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "docs = [\n", 62 | " \"The dog jumped over the cat\", \n", 63 | " \"The cat jumped over the dog\",\n", 64 | " \"It is very warm today\",\n", 65 | " \"The cat is yellow and the dog is red\",\n", 66 | "]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "8739e303-2d98-4b87-b13d-b2e2b0ec218b", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 27, 80 | "id": 
"8c92b811-c453-439c-b63e-b2df4edfe2a6", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "embeddings = model.encode(docs)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 28, 90 | "id": "53fa7db5-0d31-4893-b7ec-4ea4b822f181", 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "(4, 384)" 97 | ] 98 | }, 99 | "execution_count": 28, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "embeddings.shape" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 29, 111 | "id": "d002cdd4-61cc-46a1-8ef4-4da7a1a8cb5c", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "query = \"The cat is yellow and the dog is purple\"" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 30, 121 | "id": "943b41ed-4486-4e97-a741-3753a1c98dd7", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "tensor([[0.5122],\n", 128 | " [0.5164],\n", 129 | " [0.0782],\n", 130 | " [0.9099]])" 131 | ] 132 | }, 133 | "execution_count": 30, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "model.similarity(embeddings, model.encode([query]))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "4d3f1589-9ff8-4fd3-87ce-743ff4cd56b7", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python 3 (ipykernel)", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.11.8" 168 | } 169 | }, 170 | "nbformat": 4, 171 | 
"nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /notebooks/4 - Embeddings with IDs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f38bcb2a-d951-4b34-9978-0f84eef2930e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/Users/cfe/Dev/talk-to-django/venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", 14 | " from tqdm.autonotebook import tqdm, trange\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "from sentence_transformers import SentenceTransformer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "b2e6aea1-68f6-49cd-855a-31345d1e058b", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model = SentenceTransformer(\"all-MiniLM-L6-v2\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "7f6ca091-f29a-4445-a168-4aaef829325d", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "docs = [\n", 41 | " \"The dog jumped over the cat\", \n", 42 | " \"The cat jumped over the dog\",\n", 43 | " \"It is very warm today\",\n", 44 | " \"The cat is yellow and the dog is red\",\n", 45 | "]" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "id": "8739e303-2d98-4b87-b13d-b2e2b0ec218b", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "[{'index': 0, 'data': 'The dog jumped over the cat'},\n", 58 | " {'index': 1, 'data': 'The cat jumped over the dog'},\n", 59 | " {'index': 2, 'data': 'It is very warm today'},\n", 60 | " {'index': 3, 'data': 'The cat 
is yellow and the dog is red'}]" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "documents = []\n", 70 | "for i, x in enumerate(docs):\n", 71 | " row = {\n", 72 | " \"index\": i,\n", 73 | " \"data\": x\n", 74 | " }\n", 75 | " documents.append(row)\n", 76 | "\n", 77 | "documents" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "94f03282-7cd2-4af4-bc20-9f55dea32a17", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "dataset = [\n", 88 | " {'id': 1, 'text': 'The dog jumped over the cat'},\n", 89 | " {'id': 2, 'text': 'The cat jumped over the dog'},\n", 90 | " {'id': 3, 'text': 'It is very warm today'},\n", 91 | " {'id': 4, 'text': 'The cat is yellow and the dog is red'},\n", 92 | " {'id': 5, 'text': 'The dog jumped over the purple cow'}\n", 93 | "]" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "id": "1128ec0e-837a-4af5-a1c1-5f10415f59d0", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "document_embeddings = []\n", 104 | "for x in dataset:\n", 105 | " embed = model.encode(x.get('text'))\n", 106 | " final_data = (x.get('id'), embed)\n", 107 | " document_embeddings.append(final_data)\n", 108 | "\n", 109 | "# document_embeddings[0][1]" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "id": "388fabf5-44f0-4193-af2f-c967f036c749", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# embeddings = model.encode(dataset)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "id": "d002cdd4-61cc-46a1-8ef4-4da7a1a8cb5c", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "query = \"The dog jumped over the green cow\"\n", 130 | "query_embedding = model.encode([query])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 9, 136 | "id": 
"77b22442-2326-474a-8d69-345f81585052", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "(1, 384)" 143 | ] 144 | }, 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "query_embedding.shape" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 74, 157 | "id": "943b41ed-4486-4e97-a741-3753a1c98dd7", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# model.similarity(embeddings, model.encode([query]))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 75, 167 | "id": "4d3f1589-9ff8-4fd3-87ce-743ff4cd56b7", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "[(1, tensor([[0.5991]])),\n", 174 | " (2, tensor([[0.5876]])),\n", 175 | " (3, tensor([[-0.0063]])),\n", 176 | " (4, tensor([[0.2445]])),\n", 177 | " (5, tensor([[0.8564]]))]" 178 | ] 179 | }, 180 | "execution_count": 75, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "results = []\n", 187 | "for doc in document_embeddings:\n", 188 | " doc_id = doc[0]\n", 189 | " doc_embedding = doc[1]\n", 190 | " rank = model.similarity(doc_embedding, query_embedding)\n", 191 | " results.append(\n", 192 | " (doc_id, rank)\n", 193 | " )\n", 194 | "results" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 76, 200 | "id": "fdd9a2d6-e1c5-450d-aaaa-8bed9a7ab4be", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "results.sort(key=lambda x: x[1], reverse=True)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 77, 210 | "id": "b0c227e2-ea26-4343-a03d-35d3844ac42d", 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "5 tensor([[0.8564]]) {'id': 5, 'text': 'The dog jumped over the purple cow'}\n", 218 | "1 
tensor([[0.5991]]) {'id': 1, 'text': 'The dog jumped over the cat'}\n", 219 | "2 tensor([[0.5876]]) {'id': 2, 'text': 'The cat jumped over the dog'}\n", 220 | "4 tensor([[0.2445]]) {'id': 4, 'text': 'The cat is yellow and the dog is red'}\n", 221 | "3 tensor([[-0.0063]]) {'id': 3, 'text': 'It is very warm today'}\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "for result in results:\n", 227 | " dataset_id = result[0]\n", 228 | " rank = result[1]\n", 229 | " doc = next(doc for doc in dataset if doc['id'] == dataset_id)\n", 230 | " print(dataset_id, rank, doc)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "4ae9dde0-dff7-4f05-b11e-710421403e02", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3 (ipykernel)", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.11.8" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 5 263 | } 264 | -------------------------------------------------------------------------------- /notebooks/5 - Connecting Django.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e094b89c-fd72-402a-8985-2ba56af8f609", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "acffb2ef-bfc9-458e-bbac-fa19371208e4", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from blog.models import BlogPost" 23 | ] 24 | }, 
25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "b85be6b1-1cc6-4d45-beba-809d37e47dc2", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "qs = BlogPost.objects.all()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "bd56202a-bb40-4c99-bb67-a7e2eba6dc29", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "qs" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "7622db10-7632-45b9-b5ce-8ac2c6d2d50a", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "Python 3 (ipykernel)", 57 | "language": "python", 58 | "name": "python3" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.11.8" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 5 75 | } 76 | -------------------------------------------------------------------------------- /notebooks/6 - Semantic Search with Django and pgvector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e094b89c-fd72-402a-8985-2ba56af8f609", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "acffb2ef-bfc9-458e-bbac-fa19371208e4", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from openai import OpenAI\n", 23 | "from blog.models import BlogPost\n", 24 | "from decouple import config\n", 25 | "EMEDDING_MODEL=config(\"EMEDDING_MODEL\", default=\"text-embedding-3-small\")\n", 26 | "RECREATE_DATA=True" 27 | ] 28 
| }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "7622db10-7632-45b9-b5ce-8ac2c6d2d50a", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "docs = [\n", 37 | " \"The dog jumped over the cat\", \n", 38 | " \"The cat jumped over the dog\",\n", 39 | " \"It is very warm today\",\n", 40 | " \"The cat is yellow and the dog is red\",\n", 41 | "]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "id": "0fb8ad2a-4d1e-4f17-bbec-157fd2050e41", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "new_data = []\n", 52 | "for i, x in enumerate(docs):\n", 53 | " new_data.append(\n", 54 | " BlogPost(title=f\"Blog Post {i+1}\", content=x, can_delete=True)\n", 55 | " )\n", 56 | "\n", 57 | "if RECREATE_DATA:\n", 58 | " qs = BlogPost.objects.filter(can_delete=True)\n", 59 | " qs.delete()\n", 60 | " BlogPost.objects.bulk_create(new_data)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "id": "5ca93a4e-3a38-450f-8ff2-0e556ca08814", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "4" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "qs = BlogPost.objects.filter(can_delete=True)\n", 82 | "qs.count()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "id": "7c2be94a-b5f6-4954-a0d5-5c346006c6da", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "client = OpenAI(\n", 93 | " api_key=config(\"OPENAI_API_KEY\")\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "id": "0723b843-075c-4730-bd68-03c1d4c80fa3", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def get_embedding(text, model=EMEDDING_MODEL):\n", 105 | " text = text.replace(\"\\n\", \" \").strip()\n", 106 | " return client.embeddings.create(input=[text], model=model).data[0].embedding" 107 | ] 
108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "id": "9452e908-3231-4f4e-877c-20f17537da4d", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "'text-embedding-3-small'" 119 | ] 120 | }, 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "EMEDDING_MODEL" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "id": "bad94fd7-d990-4cd8-a035-4b6502160c2b", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "for obj in qs:\n", 138 | " if obj.embedding is None:\n", 139 | " obj.embedding = get_embedding(obj.get_embedding_text_raw())\n", 140 | " obj.save()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 21, 146 | "id": "dbbb0a69-55b8-4cc2-a9d9-4d361b449f33", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "query = \"The dog jumped over the green cow\"\n", 151 | "# query = \"The dog jumped over the cat\"\n", 152 | "query_embedding = get_embedding(query)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 22, 158 | "id": "140fbd78-ab74-449b-9138-468ea5714ab1", 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "Blog Post 2 0.2821284104662174 71.78715895337827\n", 166 | "Blog Post 1 0.2862588550525559 71.37411449474442\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "from pgvector.django import CosineDistance\n", 172 | "from django.db.models import F\n", 173 | "\n", 174 | "qs = BlogPost.objects.annotate(\n", 175 | " distance=CosineDistance('embedding',query_embedding),\n", 176 | " similarity=1 - F(\"distance\")\n", 177 | ").order_by(\"distance\")[:2]\n", 178 | "for obj in qs:\n", 179 | " print(obj.title, obj.distance, obj.similarity * 100)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | 
"id": "1906176d-73c3-43db-9f2b-2a6f357ceb8b", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.8" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /notebooks/7 - Semantic Search with Generic Foreign Keys Across Multiple Models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e094b89c-fd72-402a-8985-2ba56af8f609", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "acffb2ef-bfc9-458e-bbac-fa19371208e4", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from openai import OpenAI\n", 23 | "from products.models import Embedding\n", 24 | "from decouple import config\n", 25 | "EMEDDING_MODEL=config(\"EMEDDING_MODEL\", default=\"text-embedding-3-small\")\n", 26 | "RECREATE_DATA=True" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "e3c02b1f-54a2-43e9-be63-b38f08ce4853", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "client = OpenAI(\n", 37 | " api_key=config(\"OPENAI_API_KEY\")\n", 38 | ")\n", 39 | "def get_embedding(text, model=EMEDDING_MODEL):\n", 40 | " text = text.replace(\"\\n\", \" \").strip()\n", 41 | " return client.embeddings.create(input=[text], 
model=model).data[0].embedding" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 6, 47 | "id": "dbbb0a69-55b8-4cc2-a9d9-4d361b449f33", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "query = \"The dog jumped over the green cow\"\n", 52 | "# query = \"The dog jumped over the cat\"\n", 53 | "query_embedding = get_embedding(query)\n", 54 | "# query_embedding" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 9, 60 | "id": "140fbd78-ab74-449b-9138-468ea5714ab1", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "28 28 Blog Post 4 0.615726500749588 38.4273499250412\n", 68 | "1 1 Random not real product 0.615726500749588 38.4273499250412\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "from pgvector.django import CosineDistance\n", 74 | "from django.db.models import F\n", 75 | "\n", 76 | "qs = Embedding.objects.annotate(\n", 77 | " distance=CosineDistance('embedding',query_embedding),\n", 78 | " similarity=1 - F(\"distance\")\n", 79 | ").order_by(\"distance\")[:2]\n", 80 | "for obj in qs:\n", 81 | " print(obj.object_id, obj.content_object.id, obj.content_object.title, obj.distance, obj.similarity * 100)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "1906176d-73c3-43db-9f2b-2a6f357ceb8b", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.11.8" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 5 114 | } 115 | 
-------------------------------------------------------------------------------- /notebooks/8 - Services for Search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "e094b89c-fd72-402a-8985-2ba56af8f609", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "id": "77cc299d-ba3b-4b87-8eae-a81b3ebb1813", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from blog import services" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 5, 28 | "id": "1906176d-73c3-43db-9f2b-2a6f357ceb8b", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "results = services.search_posts(\"This is amazing\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "id": "99f1d145-65fc-4193-a820-d915fb176a54", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "0.2951098856395462" 45 | ] 46 | }, 47 | "execution_count": 7, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "results.first().similarity" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "cb8372e0-42f7-4474-95db-e95d996c0c52", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | } 64 | ], 65 | "metadata": { 66 | "kernelspec": { 67 | "display_name": "Python 3 (ipykernel)", 68 | "language": "python", 69 | "name": "python3" 70 | }, 71 | "language_info": { 72 | "codemirror_mode": { 73 | "name": "ipython", 74 | "version": 3 75 | }, 76 | "file_extension": ".py", 77 | "mimetype": "text/x-python", 78 | "name": "python", 79 | "nbconvert_exporter": "python", 80 | "pygments_lexer": "ipython3", 81 | "version": "3.11.8" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 5 
86 | } 87 | -------------------------------------------------------------------------------- /notebooks/9 - Cosine Similarity with Numpy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "c2700210-4c68-416b-85da-292c29337785", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 3, 16 | "id": "91e805d5-92df-4f85-9f65-548a0d299308", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "a = np.array([1, 2, 3])\n", 21 | "b = np.array([1, 2.1, 3.1])\n", 22 | "c = np.array([-1, 2, 0])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "id": "fd74fe32-a4aa-413d-8f48-c802758898aa", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "(3,)" 35 | ] 36 | }, 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "a.shape" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "id": "658fced1-4928-4e3c-bd8b-8698fb0e5cf5", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "3.7416573867739413" 56 | ] 57 | }, 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "magnitude = (1**2 + 2**2 + 3**2) ** (1/2)\n", 65 | "magnitude" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "id": "e443498c-8f84-4487-95d6-6e691a251a7f", 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "np.float64(3.7416573867739413)" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "mag_a = np.linalg.norm(a)\n", 87 | "mag_a" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 
92 | "execution_count": 7, 93 | "id": "a0581e8b-59e5-4622-94c0-cf25d835bbc4", 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "np.float64(3.8755644750152203)" 100 | ] 101 | }, 102 | "execution_count": 7, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "mag_b = np.linalg.norm(b)\n", 109 | "mag_b" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 8, 115 | "id": "e98a252b-8b1c-412e-acff-74e896df693e", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "np.float64(2.23606797749979)" 122 | ] 123 | }, 124 | "execution_count": 8, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "mag_c = np.linalg.norm(c)\n", 131 | "mag_c" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "id": "78a73264-ef22-4f41-8b20-92ab9896a8c4", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "np.float64(0.9999286639954388)" 144 | ] 145 | }, 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "similiar = (np.dot(a, b)) / (mag_a * mag_b)\n", 153 | "similiar" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "id": "1f01245d-e097-4ea8-b835-342dabd37006", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "np.float64(0.35856858280031806)" 166 | ] 167 | }, 168 | "execution_count": 10, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "similiar = (np.dot(a, c)) / (mag_a * mag_c)\n", 175 | "similiar" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "id": "f393d5ce-e165-4e71-8eea-eb214ff609f2", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "embedding = 
np.array([0.00860841479152441, -0.020805347710847855, 0.01674642413854599, -0.005117336288094521, -0.01579217240214348, -0.010631157085299492, 0.004210126120597124, 0.052470337599515915, -0.03362724930047989, -0.028976958245038986, -0.01891028694808483, -0.030347853899002075, 0.004122765269130468, -0.018466763198375702, 0.005617981776595116, 0.05806143581867218, -0.05703998729586601, 0.011867649853229523, -0.027874866500496864, 0.023412736132740974, 0.006340389605611563, -0.01768723502755165, -0.0051542967557907104, 0.007660883944481611, 0.02932640165090561, 0.006565512157976627, -0.03790121525526047, 0.04362671822309494, 0.05881408601999283, -0.032498277723789215, 0.042632147669792175, -0.04927157983183861, 0.013547668233513832, 0.018184520304203033, 0.009025058709084988, -0.006599112413823605, 0.015509930439293385, 0.009045219048857689, -0.010496755130589008, -0.01628945954144001, -0.043546076864004135, -0.04311599209904671, 0.039997879415750504, 0.01675986312329769, -0.019152211025357246, 0.022633207961916924, -0.04429872706532478, 0.01482448261231184, 0.036180876195430756, 0.045857783406972885, 0.006854475010186434, 0.04174509644508362, 0.015496490523219109, 0.1183808222413063, 0.008245530538260937, -0.017552832141518593, 0.04502449557185173, 0.06386758387088776, -0.018802765756845474, -0.02150423638522625, 0.026141086593270302, 0.0253481175750494, 0.014179355464875698, 0.0161281768232584, 0.007116558030247688, 0.0022814651019871235, -0.013540948741137981, 0.013070543296635151, -0.025912605226039886, 0.042632147669792175, -0.0006602472276426852, 0.026947496458888054, -0.014421278610825539, -0.022364405915141106, -0.01845332235097885, -0.007519762497395277, 0.003113074228167534, -0.012338055297732353, 0.01611473597586155, -0.05257785692811012, -0.056932464241981506, 0.005026615224778652, -0.019098449498414993, -0.059835538268089294, -0.025899164378643036, -0.014730401337146759, -0.0491640605032444, -0.0010945319663733244, -0.0022109043784439564, 
-0.0030425135046243668, 0.013137743808329105, 0.03599271550774574, -0.010194351896643639, -0.002731709973886609, -0.001493536401540041, 0.002686349442228675, -0.039540912955999374, 0.04639538750052452, 0.0691361203789711, 0.01627601869404316, 0.04701363667845726, -0.03413797542452812, 0.019474774599075317, 0.0037632412277162075, 0.030616655945777893, 0.022539127618074417, -0.014716961421072483, -0.01856084354221821, -0.016638902947306633, -0.0022629848681390285, -0.12354183942079544, -0.014071834273636341, -0.0073584807105362415, -0.031100502237677574, 0.022001521661877632, 0.0415031760931015, 0.06822218745946884, -0.05005110800266266, 0.013278866186738014, -0.01657170243561268, 0.03521318733692169, -0.02580508403480053, -0.001697658677585423, -0.029917769134044647, -0.0506693571805954, -0.018493643030524254, -0.013164624571800232, 0.054298195987939835, -0.05574973300099373, -0.02857375331223011, 0.0460728257894516, 0.022176243364810944, -0.00913930032402277, -0.007331600412726402, -0.01876244507730007, 0.013735830783843994, -0.040159162133932114, -0.028466232120990753, -0.02593948505818844, 0.011437565088272095, 0.07795285433530807, -0.043734241276979446, -0.02317081391811371, -0.04510513320565224, -0.02640989050269127, -0.009582825005054474, -0.0009937308495864272, 0.024366987869143486, -0.008185049518942833, -0.009475303813815117, 0.01829204149544239, 0.030777938663959503, -0.058168958872556686, 0.022794490680098534, -0.021840238943696022, -0.014259996823966503, 0.015214246697723866, -0.01983765698969364, 0.015227687545120716, 0.041261252015829086, 0.01374927069991827, 0.015321767888963223, -0.05962049588561058, -0.002624188782647252, 0.00753320287913084, 0.0018362601986154914, -0.025227157399058342, -0.010416114702820778, -0.055050842463970184, -0.05023927241563797, 0.013923992402851582, 0.008621854707598686, -0.030320972204208374, -0.023399297147989273, 0.0222837645560503, 0.005785983521491289, 0.024326667189598083, -0.042336463928222656, 
-0.044916972517967224, -0.016034096479415894, 0.03314340487122536, 0.058276478201150894, 0.010926839895546436, 0.00070602772757411, -0.019125329330563545, -0.029702726751565933, 0.0009172900463454425, -0.011713088490068913, 0.01659858226776123, 0.007875926792621613, -0.02349337749183178, 0.08859745413064957, 0.053518667817115784, 0.013346066698431969, 0.015106725506484509, -0.03655720129609108, 0.08365147560834885, -0.0005783463711850345, -0.029864007607102394, 0.022203123196959496, 0.007304720114916563, -0.023479938507080078, -0.031234903261065483, -0.032498277723789215, 0.015872813761234283, -0.008581534028053284, -0.013762710615992546, 0.07655508071184158, 0.005604541394859552, -0.03010592982172966, 0.05542716756463051, -0.0075802430510520935, -0.02532123774290085, -0.023157374933362007, -0.04343855753540993, -0.004636850673705339, 0.05919041112065315, 0.00866889487951994, 0.01646418124437332, -0.011545086279511452, -0.04897589609026909, 0.06085698679089546, 0.07935062795877457, -0.02520027570426464, 0.0384119413793087, -0.010281712748110294, 0.033250924199819565, 0.0007917087059468031, 0.04193326085805893, 0.025267478078603745, 0.048035088926553726, 0.011081401258707047, 0.04642226919531822, -0.052927300333976746, 0.017633473500609398, -0.007324880454689264, 0.05190584808588028, -0.011424125172197819, -0.004542769864201546, -0.01579217240214348, -0.014085274189710617, 0.034164853394031525, 0.014327197335660458, 0.013238545507192612, -0.0009996109874919057, -0.029917769134044647, -0.0008912498014979064, -0.008783136494457722, -0.013722390867769718, 0.008574814535677433, 0.029837127774953842, 0.03169186785817146, 0.018238279968500137, -0.03499814495444298, 0.0142196761444211, -0.008776416070759296, -0.009125860407948494, -0.01796947792172432, 0.06257732957601547, -0.030536014586687088, -0.020361823961138725, -0.05499708279967308, -0.014730401337146759, -0.022512247785925865, -0.02903071790933609, -0.04042796418070793, 0.018318921327590942, -0.023103613406419754, 
0.004189965780824423, 0.008507613092660904, 0.029595205560326576, -0.00821864977478981, 0.05849152058362961, -0.014179355464875698, 0.013628309592604637, -0.02505243569612503, 0.008117849007248878, 0.029407043009996414, 0.03596583381295204, 0.024622350931167603, -0.014165915548801422, 0.00564822182059288, 0.010879799723625183, -0.00776168517768383, -0.019568854942917824, 0.006081666797399521, -0.0033264365047216415, 0.02705501765012741, 0.03284772112965584, -0.004428528714925051, 0.03056289628148079, 0.024380426853895187, 0.015550251118838787, 0.00874281581491232, -0.03736361116170883, 0.02225688472390175, 0.0019068209221586585, 0.007613843772560358, 0.02290201187133789, -0.03825065866112709, 0.06494279205799103, -0.031557466834783554, -0.0033852371852844954, 0.041583817452192307, 0.005809503607451916, 0.04704051464796066, 0.03564327210187912, -0.025616921484470367, 0.03131554275751114, 0.01506640575826168, -0.028466232120990753, -0.012418696656823158, 0.04470193013548851, -0.010953720659017563, -0.007741524837911129, -0.031261783093214035, 0.029003838077187538, 0.010893239639699459, 0.021477356553077698, -0.028627514839172363, 0.04792756587266922, -0.030079049989581108, -0.014394397847354412, -0.0026729092933237553, 0.020375262945890427, -0.0050232550129294395, 0.05919041112065315, -0.030025290325284004, -0.002513307612389326, -0.01903124898672104, -0.027122218161821365, -0.022203123196959496, 0.011020921170711517, 0.020334942266345024, 0.008944418281316757, 0.009455143474042416, -0.014864803291857243, 0.010812598280608654, 0.013191504403948784, 0.005933824926614761, -0.03717544674873352, -0.017001787200570107, 0.013419987633824348, -0.0075668031349778175, -0.031234903261065483, 0.003032433334738016, 0.05158328637480736, -0.03636904060840607, -0.022471927106380463, -0.003931242972612381, -0.0027787506114691496, -0.00821193028241396, 0.018493643030524254, -0.02365466021001339, 0.007875926792621613, 0.008467293344438076, 0.03884202614426613, -0.0011037720832973719, 
0.002064742613583803, 0.02981024794280529, -0.02428634651005268, -0.029245760291814804, 0.003113074228167534, -0.010604276321828365, 0.038385059684515, 0.03612711653113365, -0.04311599209904671, -0.024649230763316154, 0.0023251455277204514, 0.04252462834119797, -0.004176525864750147, -0.0011726528173312545, 0.033707890659570694, 0.003769961418583989, -0.05206713080406189, -0.023412736132740974, 0.0353744700551033, -0.0008479892858304083, 0.05177144706249237, 0.05427131429314613, 0.02733726054430008, -0.024178825318813324, -0.001118892221711576, -0.004599890671670437, 0.00136417499743402, -0.011706368066370487, 0.0028745115268975496, -0.002286505186930299, -0.0008563894079998136, -0.017566272988915443, 0.07026509195566177, -0.03424549475312233, 0.011860930360853672, -0.002271384932100773, 0.01036907359957695, -0.013339346274733543, -0.024514829739928246, 0.005476860329508781, -0.032202593982219696, 0.006347109563648701, 0.04537393897771835, -0.014501919038593769, 0.006652873009443283, -0.015120166353881359, -0.02905759960412979, -0.03655720129609108, -0.021396715193986893, 0.0237890612334013, 0.005544060841202736, 0.009166180156171322, -0.0077751255594193935, -0.008951137773692608, 0.022216564044356346, 0.04929846152663231, -0.024985233321785927, -0.005903584882616997, -0.0036254797596484423, -0.03058977611362934, -0.005476860329508781, 0.03362724930047989, 0.043707359582185745, 0.031396184116601944, 0.003937963396310806, 0.02194776013493538, -0.03026721253991127, 0.01813075877726078, 0.04467505216598511, 0.0076676043681800365, 0.00644791079685092, -0.05698622390627861, -0.021477356553077698, -0.012398536317050457, -0.036772243678569794, 0.0017925796564668417, 0.011074681766331196, -0.05639486014842987, -0.0072576794773340225, -0.010530355386435986, 0.019582295790314674, 0.003524678759276867, 0.03244451433420181, 0.007526482455432415, -0.01542928908020258, -0.026221727952361107, -0.006256388500332832, -0.027606062591075897, -0.053518667817115784, 
-0.027686703950166702, 0.0038606824818998575, 0.004865333437919617, 0.01520080678164959, -0.037713054567575455, 0.02948768436908722, 0.05725502967834473, -0.008890657685697079, -0.010866358876228333, -0.024931473657488823, 0.00614550756290555, 0.01749907247722149, 0.02056342549622059, 0.0184264425188303, -0.0026359488256275654, -0.024555150419473648, 0.0384119413793087, 0.028493111953139305, 0.011726528406143188, 0.03408421576023102, 0.02214936353266239, -0.0028005908243358135, -0.03368100896477699, 0.014192795380949974, 0.01826515980064869, 0.007586963474750519, -0.03870762512087822, -0.04956726357340813, 0.002079862868413329, -0.023479938507080078, 0.033116523176431656, -0.027901746332645416, -0.01260013785213232, -0.003487718291580677, 0.03131554275751114, -0.0030038729310035706, 0.006777194328606129, 0.014421278610825539, -0.00020097220840398222, -0.044943854212760925, -0.006404230371117592, -0.024393867701292038, 0.006478151306509972, -0.03467557951807976, 0.06101826950907707, 0.04515889659523964, -0.01474384218454361, -0.0034154774621129036, -0.04873397573828697, -0.018184520304203033, -0.0024225865490734577, -0.04575026035308838, 0.0026443491224199533, -0.02611420676112175, -0.059405453503131866, 0.0066495127975940704, 0.01104780100286007, 0.07945815473794937, 0.0013759351568296552, -0.051690805703401566, -0.026087326928973198, 0.045078255236148834, -0.02167895808815956, -0.03669160231947899, 0.006397510413080454, -0.007701204624027014, -0.03101986087858677, 0.005191256757825613, -0.005953985266387463, 0.012606858275830746, -0.00679735466837883, 0.003334836568683386, -0.026060445234179497, 0.02209560200572014, 0.04792756587266922, -0.024676110595464706, -0.0284393522888422, 0.028385590761899948, 0.023238014429807663, -0.022498806938529015, -0.028170548379421234, -0.02380250208079815, 0.01829204149544239, -0.023694980889558792, -0.021625196561217308, 0.02658461220562458, -0.014985764399170876, 0.0016254178481176496, -0.017297469079494476, 
-0.02690717577934265, -0.00018595704750623554, -0.022162802517414093, -0.04752436280250549, 0.005026615224778652, 0.02981024794280529, -0.001849700347520411, 0.011551806703209877, -0.021302632987499237, 0.04196013882756233, 0.05956673249602318, -0.005171096883714199, 0.012149893678724766, 0.02322457544505596, 0.030831698328256607, -0.022619768977165222, 0.001192813040688634, -0.019703255966305733, 0.0037296409718692303, -0.016020655632019043, -0.02136983536183834, -0.001797619741410017, -0.010422834195196629, 0.05467452108860016, 0.052309054881334305, 0.029622085392475128, 0.023305214941501617, 0.00968362670391798, -0.0069351159036159515, 0.0230364128947258, 0.024313226342201233, 0.003591879503801465, -0.02487771213054657, -0.040320441126823425, -0.009522344917058945, -0.042013902217149734, -0.009602985344827175, -0.08800608664751053, 0.016531381756067276, -0.0130638238042593, 0.01873556524515152, 0.04720179736614227, 0.021477356553077698, -0.01127628330141306, -0.024850832298398018, -0.055212125182151794, -0.04655667021870613, 0.03274019807577133, -0.03265955671668053, 0.04443312808871269, 0.015362088568508625, -0.005389499012380838, -0.013077263720333576, 0.0013330946676433086, -0.011834049597382545, 0.0010290113277733326, 0.012210373766720295, 0.04669107124209404, -0.015335208736360073, -0.03085857816040516, -0.0010584116680547595, -0.00047124517732299864, 0.00718375900760293, 0.008312731049954891, 0.01544272992759943, 0.03747113049030304, 0.002486427314579487, -0.044325605034828186, -0.0007606283761560917, -0.02702813595533371, 0.008568094111979008, -0.004888853523880243, 0.00038220419082790613, -0.03040161356329918, -0.002486427314579487, 0.015281448140740395, 0.009777707047760487, -0.03198755159974098, -0.03260579705238342, 0.034648701548576355, -0.018990928307175636, 0.00529877794906497, 0.020482784137129784, -0.02198808081448078, -0.03389605134725571, -0.03932587057352066, 0.018977487459778786, 0.015375528484582901, -0.008621854707598686, 
0.029729606583714485, -0.03058977611362934, 0.0017875395715236664, -0.017700674012303352, -0.003312996355816722, 0.010416114702820778, 0.010026349686086178, 0.02409818395972252, -0.04037420451641083, -0.007177038583904505, 0.012875661253929138, 0.0020966629963368177, -0.005886784754693508, -0.0032121953554451466, -0.032793961465358734, 0.03857322409749031, -0.007371921092271805, -0.0027737105265259743, -0.055534690618515015, 0.03282083943486214, 0.014515358954668045, 0.02886943705379963, 0.022673528641462326, 0.01860116422176361, -0.016168497502803802, 0.026141086593270302, 0.0008719295728951693, 0.036315277218818665, -0.00876297615468502, -0.000782048562541604, -0.032471396028995514, -0.024608910083770752, -0.008964578621089458, -0.02639644965529442, 0.004714131820946932, -0.0051374961622059345, 0.06343749910593033, -0.02179992012679577, -0.029890887439250946, -0.02197464182972908, 0.008474012836813927, -0.027391020208597183, -0.007096397690474987, -0.004462128970772028, -0.002689709421247244, 0.008924257941544056, -0.021853679791092873, 0.058437760919332504, 0.005658301990479231, 0.027068456634879112, 0.00016957685875240713, -0.020845668390393257, -0.03072417713701725, -0.014259996823966503, -0.029245760291814804, 0.011451005935668945, 0.026181407272815704, -0.04577714204788208, -0.006216068286448717, 0.014206236228346825, 0.015268007293343544, 0.01861460506916046, 0.021127911284565926, -0.012176773510873318, -0.005413019564002752, -0.04023980349302292, -0.017915716394782066, -0.01529488805681467, -0.012512777000665665, -0.010133870877325535, -0.0737057700753212, 0.02658461220562458, -0.011316603980958462, -0.004226926248520613, 0.029272641986608505, 0.02059030532836914, -0.018197959288954735, -0.02489115297794342, -0.02827806957066059, 0.0015817373059689999, -0.011356924660503864, -0.008124569430947304, 0.03507878631353378, 0.05596477538347244, -0.012949582189321518, -0.04502449557185173, -0.014246555976569653, -0.04782004654407501, -0.015872813761234283, 
-0.017754435539245605, -0.006138787604868412, -0.0142196761444211, -0.03169186785817146, -0.00117097282782197, -0.00019257211533840746, -0.009938988834619522, 0.011807169765233994, -0.018009796738624573, -0.00791624654084444, -0.026678692549467087, 0.017848515883088112, -0.029407043009996414, -0.01106124185025692, 0.030482254922389984, 0.0023083453997969627, -0.007889366708695889, -0.011719808913767338, -0.0032609158661216497, -0.016517940908670425, 0.007190478965640068, -0.007586963474750519, -0.03169186785817146, -0.009468584321439266, -0.015496490523219109, -0.040777407586574554, -0.010705077089369297, -0.016679223626852036, 0.011101561598479748, -0.0032138752285391092, -0.027149097993969917, 0.011175482533872128, 0.007331600412726402, 0.00799016747623682, -0.026087326928973198, 0.0028442712500691414, -0.009623145684599876, 0.010335473343729973, -0.01296974252909422, 0.01827860064804554, 0.002753550186753273, -0.037551771849393845, 0.014273436740040779, 0.004015244077891111, -0.02241816557943821, 0.01627601869404316, -0.0032743560150265694, 0.03360036760568619, 0.013500628061592579, 0.018547402694821358, 0.007600403390824795, 0.009542504325509071, -0.008648734539747238, -0.017431871965527534, -0.016706103459000587, 0.01690770499408245, 0.03300900384783745, -0.005705342628061771, 0.045857783406972885, 0.005281977821141481, 0.01642386056482792, -0.01876244507730007, -0.013137743808329105, 0.0071434383280575275, -0.03185315057635307, 0.004368048161268234, 0.025697562843561172, 0.021625196561217308, -0.00891081802546978, 0.014918563887476921, 0.022834811359643936, -0.022915450856089592, -0.0016363379545509815, -0.010375794023275375, 0.01934037171304226, -0.028681274503469467, -0.007002316880971193, -0.019891418516635895, 0.01459600031375885, -0.013964313082396984, 0.009253541938960552, 0.0645127072930336, 0.02718941867351532, -0.04698675498366356, 0.032632678747177124, 0.02903071790933609, 0.005312218330800533, -0.005164376460015774, 0.04972854629158974, 
0.03072417713701725, -0.01795603707432747, -0.024219145998358727, -0.02951456420123577, 0.04066988453269005, -0.003884202567860484, -0.007210639305412769, -0.020509665831923485, 0.022028401494026184, -0.008521053940057755, -0.03239075466990471, 0.0037262809928506613, -0.024232586845755577, 0.00891081802546978, -0.009078819304704666, 0.02443418838083744, -0.019595734775066376, 0.01059083640575409, -0.02241816557943821, -0.04819636791944504, -0.02194776013493538, 0.04394928365945816, 0.0012070932425558567, -0.024219145998358727, 0.012788300402462482, 0.0168808251619339, -0.007869206368923187, -0.008238810114562511, -0.023694980889558792, -0.03255203738808632, -0.034164853394031525, 0.03010592982172966, 0.03397669270634651, 0.005859904456883669, -0.016074417158961296, -0.04701363667845726, -0.011565246619284153, -0.0029786727391183376, 0.005755743011832237, -0.0008542893920093775, -0.040938690304756165, -0.008850337006151676, -0.016827065497636795, -0.0008215290145017207, -0.022969212383031845, -0.011242683045566082, 0.029729606583714485, -0.03134242445230484, -0.01628945954144001, 0.05790015682578087, 0.01643729954957962, -0.009381222538650036, 0.034353017807006836, -0.006014465820044279, 0.006054786499589682, -0.01389711257070303, -0.03196066990494728, -0.0013893753057345748, 0.008171609602868557, 0.024541709572076797, -0.02119511179625988, 0.040643006563186646, -0.020173661410808563, 0.004979574587196112, 0.013164624571800232, 0.018480202183127403, 0.01876244507730007, 0.010604276321828365, -0.008003607392311096, 0.017136188223958015, -0.00687127560377121, -0.008332891389727592, 0.04359984025359154, -0.03058977611362934, 0.021907441318035126, -0.013554388657212257, -0.05209401249885559, 0.02318425476551056, 0.015321767888963223, -0.0020395424216985703, -0.02318425476551056, -0.0016069376142695546, -0.006824234966188669, -0.03499814495444298, 0.0207381471991539, 0.021154792979359627, -0.024662669748067856, 0.0012642138171941042, -0.013184784911572933, 
-0.004650291055440903, -0.039540912955999374, -0.0039043629076331854, 0.020966630429029465, -0.009690346196293831, 0.031073620542883873, 0.012358215637505054, 0.026356128975749016, -0.01567121222615242, -0.016208818182349205, 0.007815445773303509, 0.01235149521380663, 0.015362088568508625, 0.02319769375026226, -0.011148602701723576, -0.005792703479528427, 0.026463650166988373, -0.008474012836813927, -0.015765292569994926, -0.014958884567022324, 0.022485366091132164, 0.029407043009996414, 0.02518683671951294, -0.0007371081155724823, 0.007418961264193058, 0.04859957471489906, -0.02444762922823429, 0.04029356315732002, 0.003012272994965315, -0.022834811359643936, 0.021087590605020523, 0.0005195456906221807, 0.0025855484418570995, -0.018251720815896988, -0.03443365916609764, 0.02567068114876747, -0.013258705846965313, -0.017740994691848755, -0.00299379276111722, 0.013003342784941196, -0.00944170355796814, 0.009885228238999844, 0.02151767536997795, 0.0023738660383969545, -0.022041842341423035, 0.02362777851521969, -0.01336622703820467, -0.01435407716780901, -0.019017808139324188, -0.00974410679191351, 0.009119139984250069, -0.01948821358382702, 0.026302369311451912, 0.0036926805041730404, 0.019931739196181297, 0.022216564044356346, 0.003927883226424456, -0.00045234497520141304, -0.0067368741147220135, 0.005651582032442093, 0.03728296980261803, 0.007849046029150486, -0.0024141864851117134, 0.00991210900247097, 0.029595205560326576, -0.014434718526899815, 0.006773834116756916, -0.008890657685697079, -0.022377844899892807, -0.03454117849469185, 0.002293225144967437, -0.004462128970772028, -3.5910394217353314e-05, 0.007056077476590872, -0.010496755130589008, -0.041234374046325684, 0.036180876195430756, 0.022821370512247086, -0.01951509341597557, -0.004109324887394905, 0.037686172872781754, -0.02747166156768799, 0.02287513017654419, 0.006740233860909939, -0.006807434838265181, 0.021894000470638275, 0.060426902025938034, 0.0070426370948553085, -0.0028694714419543743, 
0.014797602780163288, 0.02643677033483982, 0.0024511469528079033, -0.01544272992759943, 0.01811731792986393, 0.0253481175750494, 0.030616655945777893, -0.029245760291814804, -0.010187631472945213, -0.0010550515726208687, -0.026073886081576347, -0.022861691191792488, 0.018964048475027084, 0.0074861622415483, -0.002427626634016633, -0.023990662768483162, -0.026651812717318535, -0.027149097993969917, -0.002338585676625371, 0.014958884567022324, 0.0253346785902977, 0.0025956283789128065, 0.02243160642683506, -0.013514067977666855, -0.0024477869737893343, -0.0023520258255302906, 0.025926044210791588, 0.016800183802843094, -0.022337526082992554, 0.03712168708443642, -0.009327461943030357, -0.006367269903421402, 0.020321503281593323, 0.017284030094742775, 0.030670417472720146, -0.017431871965527534, 0.012183493934571743, 0.010926839895546436, -0.015698092058300972, -0.01206925231963396, -0.010631157085299492, -0.0033885971643030643, 0.010953720659017563, -0.002184024080634117, -0.00976426713168621, 0.016786744818091393, -0.0011592126684263349, -0.0065419916063547134, 0.025576600804924965, -0.005765823181718588, 0.026759333908557892, -0.0028812317177653313, 0.025711001828312874, -0.016235698014497757, 0.007889366708695889, -0.01706898771226406, -0.03806249797344208, -0.0014926963485777378, 0.01168620865792036, -0.0039715636521577835, -0.012788300402462482, -0.029111359268426895, 0.00011697128502419218, 0.008621854707598686, -0.008736096322536469, -0.01826515980064869, 0.004626770969480276, -0.0044150883331894875, -0.02091286890208721, -0.02060374617576599, -0.008480733260512352, -0.017566272988915443, 0.027874866500496864, 0.029541444033384323, 0.0009567704983055592, 0.038761384785175323, -0.0088368970900774, -3.018783161223837e-07, 0.01951509341597557, -0.04666419327259064, -0.012862221337854862, -0.046798594295978546, 0.007217359263449907, 0.01166604831814766, -0.014958884567022324, 0.030670417472720146, 0.047067396342754364, 0.024689551442861557, 0.01794259622693062, 
-0.009387942962348461, -0.008998178876936436, -0.03903018683195114, 0.012842060998082161, 0.002274744911119342, -0.014098715037107468, 0.013856791891157627, -0.01737811043858528, -0.002689709421247244, -0.025603480637073517, 0.0049459743313491344, 0.012311175465583801, 0.004851893056184053, -0.01814419962465763, -0.040481723845005035, -0.008037208579480648, 0.01948821358382702, -0.021477356553077698, 0.03483686223626137, 0.0015338568482547998, 0.011330043897032738, 0.008185049518942833, -0.012465736828744411, -0.007069517392665148, -0.04437936842441559, -0.0010441314661875367, -0.013285585679113865, 0.014461598359048367, 0.04529329761862755, -0.021894000470638275, -0.03088545985519886, -0.00846057292073965, 0.004882133565843105, -0.003877482609823346, -0.005873344372957945, 0.016934586688876152, 0.00602454598993063, -0.014878243207931519, -0.008339611813426018, 0.0035851593129336834, 0.004693971481174231, -0.015415849164128304, -0.013278866186738014, 0.012102852575480938, 0.0019118610071018338, -0.033707890659570694, -0.01935381256043911, -0.008521053940057755, -0.014475039206445217, -0.008870497345924377, -0.005906944628804922, -0.01766035333275795, 0.012909261509776115, 0.0005565061001107097, -0.011773569509387016, -0.0050534955225884914, 0.06483527272939682, 0.015496490523219109, -0.010772278532385826, 0.005208057351410389, -0.006488231010735035, 0.00897129811346531, 0.015348648652434349, -0.01826515980064869, 0.003050913568586111, 0.03628839924931526, -0.01735123060643673, -0.007009036839008331, -0.033546607941389084, 0.043734241276979446, 0.044648170471191406, 0.005322298500686884, -0.03515942767262459, -0.022660087794065475, -0.011632448062300682, -0.011471166275441647, -0.008810016326606274, 0.1136498898267746, -0.01657170243561268, -0.032014429569244385, -0.00821193028241396, 0.008191769942641258, -0.011605567298829556, -0.034783102571964264, -0.004579730331897736, 0.0008013687911443412, -0.012499337084591389, -0.003336516674607992, -0.054782040417194366, 
-0.04534705728292465, -0.001874900539405644, 0.010792438872158527, 0.04351919889450073, -0.012842060998082161, -0.002184024080634117, -0.00830601155757904, -0.0015682971570640802, 0.016974905505776405, -0.07451217621564865, 0.004438608884811401, 0.013500628061592579, 0.017257150262594223, -0.025388438254594803, -0.013540948741137981, -0.020953189581632614, 0.007385361008346081, 0.041879501193761826, 0.0013062143698334694, 0.008883937261998653, 0.03491750359535217, -0.007022477220743895, 0.01673298329114914, 0.02747166156768799, -0.012398536317050457, -0.031557466834783554, -0.02397722378373146, 0.025012115016579628, -0.04881461709737778, 0.003880842588841915, 0.023990662768483162, -0.02396378293633461, 0.0036826003342866898, 0.006263108924031258, -0.002268024953082204, -0.025012115016579628, 0.0027602703776210546, -0.014165915548801422, 0.002079862868413329, 0.04029356315732002, -0.0011944931466132402, 0.02671901322901249, 0.006014465820044279, -0.013151184655725956, 0.011430845595896244, 0.031261783093214035, 0.05411003530025482, -0.005574301350861788, -0.01036907359957695, -0.013527508825063705, 0.021920880302786827, 0.033439088612794876, -0.008198490366339684, -0.006256388500332832, -0.01720338873565197, 0.020980069413781166, -0.01584593392908573, 0.022337526082992554, 0.019783897325396538, -0.002360425889492035, 0.027229739353060722, 0.01436751801520586, 0.027928626164793968, 0.018076999112963676, -0.02213592268526554, 0.01467664074152708, -0.0008601694134995341, 0.03698728606104851, 0.0021403434220701456, -0.010799158364534378, 0.02701469697058201, 0.005533980671316385, 0.023896582424640656, 0.005050135310739279, 0.005214777309447527, 0.010577396489679813, -0.04282030835747719, -0.007445841562002897, -0.013010063208639622, -0.023748740553855896, -0.012317894957959652, -0.009428263641893864, 0.01860116422176361, 0.004761172458529472, -0.05730878934264183, -0.0038270819932222366, -0.013097424060106277, 0.018654923886060715, 0.016679223626852036, 
0.028842557221651077, 0.004411728587001562, 0.04252462834119797, -0.019783897325396538, -0.012573258019983768, -0.015926575288176537, -0.020791908726096153, -0.04806196689605713, 0.019703255966305733, -0.013843351975083351, -0.0034742781426757574, 0.011182202957570553, 0.020536545664072037, 0.026490529999136925, -0.08671583235263824, -0.03881514444947243, -0.03072417713701725, -0.004929174203425646, -0.01764691434800625, 0.04841141030192375, 0.028950078412890434, 0.005544060841202736, -0.02886943705379963, 0.03857322409749031, -0.008581534028053284, 0.0399441197514534, 0.026342689990997314, 0.015133606269955635, -0.03854634240269661, 0.0095895454287529, -0.002916512079536915, 0.0036053196527063847, 0.016638902947306633, -0.019286612048745155, 0.031718749552965164, 0.015577130950987339, -0.044621288776397705, -0.0023772260174155235, 0.0006644473178312182, 0.004008524119853973, -0.020509665831923485, -0.005802783649414778, -0.008245530538260937, -0.014703521504998207, -0.00967018585652113, 0.02857375331223011, 0.0052382973954081535, -0.00928714219480753, -0.03322404623031616, -0.03577767312526703, -0.022660087794065475, -0.00464693084359169, -0.014085274189710617, -0.006330309435725212, 0.0008929297910071909, -0.026530850678682327, -0.016060976311564445, 0.021423595026135445, -0.025428758934140205, -0.02395034208893776, -0.015362088568508625, -0.0053693391382694244, -0.01212301291525364, -0.03489062190055847, 0.008628575131297112, -0.02765982411801815, 0.009105700068175793, -0.027874866500496864, 0.011410685256123543, 0.007217359263449907, -0.03903018683195114, -0.006914956029504538, 0.025858843699097633, 0.04136877506971359, -0.00733832037076354, 0.0026157887186855078, 0.03521318733692169, 0.030509134754538536, 0.004798132460564375, -0.043384797871112823, -0.033385325223207474, 0.031073620542883873, -0.0253346785902977, 0.001041611423715949, 0.023560578003525734, -0.041234374046325684, 0.00023100253019947559, -0.011309884488582611, -0.04873397573828697, 
-0.029138239100575447, -0.005305498372763395, -0.0007673484506085515, -0.010234672576189041, -0.010624436661601067, -0.03507878631353378, -0.05238969624042511, -0.036933526396751404, 0.010328752920031548, -0.0029417122714221478, -0.032014429569244385, -0.020859109237790108, 0.014273436740040779, 0.008043928071856499, -0.013184784911572933, 0.029138239100575447, -0.0291920006275177, 0.027377581223845482, -0.01964949630200863, 0.0007816285942681134, 0.01720338873565197, -0.0037229207810014486, -0.030159691348671913, -0.0038035616744309664, 0.048922136425971985, 0.04282030835747719, 0.005859904456883669, 0.03604647517204285, 0.022982651367783546, 0.004099245183169842, 0.013776151463389397, -0.0027619502507150173, 0.010933560319244862, -0.017163068056106567, -0.010160751640796661, 0.0337885320186615, -0.01467664074152708, -0.018036678433418274, -0.009266981855034828, 0.0011432525934651494, 0.05249721556901932, 0.004902293905615807, 0.01889684796333313, 0.0322832353413105, -0.007620563730597496, 0.014018073678016663, 0.004025324247777462, 0.0019404212944209576, 0.0053693391382694244, 0.011504766531288624, -0.019232850521802902, -0.013164624571800232, 0.00015991675900295377, -0.02104727178812027, 0.027270060032606125, 0.03714856877923012, 0.027149097993969917, -0.009865067899227142, 0.017082426697015762, -0.008144729770720005, -0.010080110281705856, 0.028197430074214935, -0.030186571180820465, -0.030670417472720146, 0.008393372409045696, 0.012983182445168495, 0.018197959288954735, -0.009428263641893864, 0.026759333908557892, -0.004912374075502157, -0.01435407716780901, -0.008715935982763767, 0.03147682547569275, -0.023264896124601364, 0.0019320212304592133, -0.009697066619992256, 0.020778467878699303, -0.0023083453997969627, 0.022807929664850235, 0.01542928908020258, 0.008521053940057755, -0.0013297345722094178, 0.013419987633824348, -0.014259996823966503, -0.010933560319244862, 0.027552302926778793, 0.002931632101535797, -0.009468584321439266, 0.022794490680098534, 
0.00013408646918833256, -0.013191504403948784, -0.013776151463389397, -0.027552302926778793, 0.0009382903226651251, -0.055373407900333405, -0.01689426600933075, 0.006400870159268379, 0.003934603184461594, 0.011692928150296211, 0.007304720114916563, 0.038626983761787415, 0.022619768977165222, -0.006948556285351515, 0.003850602312013507, 0.013393106870353222, 0.03830442205071449, -0.026073886081576347, 0.0018093799008056521, 0.032471396028995514, -0.0006686473498120904, -0.002916512079536915, -0.009770987555384636, 0.0026258688885718584, -0.016383539885282516, -0.010947000235319138, 0.0026729092933237553, 0.018668364733457565, 0.007970007136464119, -0.040481723845005035, -0.015630891546607018, 0.01081931870430708, 0.009031779132783413, 0.011498046107590199, -0.012795020826160908, -0.02424602583050728, 0.004280686844140291, -0.04282030835747719, -0.002550268080085516, -0.007197198923677206, 0.04865333437919617, 0.04128813371062279, 0.03241763636469841, 0.001601057592779398, -0.028923196718096733, -0.011887810193002224, -0.03779369592666626, 0.03292836248874664, -0.020334942266345024, 0.009576105512678623, -0.04397616162896156, 0.0046166907995939255, -0.010826039128005505, 0.01782163605093956, -0.01686738431453705, 0.021463915705680847, -0.01768723502755165, -0.013937433250248432, 0.007392080966383219, 0.014259996823966503, -0.014152475632727146, 0.025697562843561172, 0.02271384932100773, -7.334330439334735e-05, 0.01737811043858528, -0.0024813872296363115, 0.01436751801520586, 0.009818027727305889, -0.03870762512087822, -0.016638902947306633, 0.014327197335660458, -0.011531646363437176, -0.0039010029286146164, -0.001152492593973875, -0.021853679791092873, -0.0399709977209568, -0.011854209937155247, -0.005937185138463974, -0.013332626782357693, 0.026167966425418854, -0.03507878631353378, -0.0415031760931015, -0.023009533062577248, -0.01827860064804554, -0.002108423039317131, -0.024635789915919304, 0.042175181210041046, -0.02674589306116104, -0.030052170157432556, 
0.0005938865360803902, -0.005318938288837671, 0.008729375898838043, -0.026598051190376282, -0.005399579182267189, -0.022404726594686508, 0.01780819520354271, -0.029460802674293518, -0.0033113162498921156, -0.039083950221538544, 0.010261552408337593, 0.009925548918545246, -0.028788795694708824, -0.03454117849469185, -0.014071834273636341, 0.002056342549622059, 0.030966099351644516, 0.001849700347520411, -0.019730135798454285, -0.008890657685697079, 0.0019555415492504835])" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 12, 191 | "id": "882cabd8-d54d-48ca-84ad-e29aa62ef0f2", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "np.float64(1.0000000005023455)" 198 | ] 199 | }, 200 | "execution_count": 12, 201 | "metadata": {}, 202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "mag_e = np.linalg.norm(embedding)\n", 207 | "mag_e" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 15, 213 | "id": "b03d43ef-fe40-4a62-9b79-5999dd71f4f9", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "(92, 7)\n", 221 | "(73, 26)\n", 222 | "(73, 26)\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "def calculate_cosine_metrics(v1, v2):\n", 228 | " dot_product = np.dot(v1, v2)\n", 229 | " magnitude1 = np.linalg.norm(v1)\n", 230 | " magnitude2 = np.linalg.norm(v2)\n", 231 | " cosine_similarity = dot_product / (magnitude1 * magnitude2)\n", 232 | " cosine_distance = 1 - cosine_similarity\n", 233 | " return int(cosine_similarity* 100), int(cosine_distance * 100)\n", 234 | "\n", 235 | "\n", 236 | "vector1 = np.array([1, 2, 3]) # Along x-axis\n", 237 | "vector2 = np.array([0, 1, 3]) # Along y-axis\n", 238 | "vector3 = np.array([3, 0, 1])\n", 239 | "\n", 240 | "compare_vector = np.array([1, 1, 1])\n", 241 | "\n", 242 | "\n", 243 | "print(calculate_cosine_metrics(vector1, compare_vector))\n", 244 
| "print(calculate_cosine_metrics(vector2, compare_vector))\n", 245 | "print(calculate_cosine_metrics(vector3, compare_vector))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "d4d0e377-82a0-42ad-820a-800d33ebab24", 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3 (ipykernel)", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.11.8" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 5 278 | } 279 | -------------------------------------------------------------------------------- /notebooks/99 - Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8dd700e2-34e8-4c9a-803c-0270c0f7c9f8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import setup\n", 11 | "\n", 12 | "setup.init_django()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "f5adc0e1-26f7-4430-9de7-0a2df550063c", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from rag import (\n", 23 | " db as rag_db, \n", 24 | " engines as rag_engines,\n", 25 | " settings as rag_settings, \n", 26 | " updaters as rag_updaters,\n", 27 | " patches as rag_patches,\n", 28 | ")" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "id": "ba98d441-cc40-441f-b695-4f950bcfc130", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from typing import Optional, Union\n", 39 | "from sqlalchemy import create_engine, text" 40 | ] 41 | }, 42 | { 43 | 
"cell_type": "code", 44 | "execution_count": 4, 45 | "id": "bca05a98-dca8-4c61-be99-533eb9511039", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "rag_settings.init()\n", 50 | "rag_db.init_vector_db()\n", 51 | "rag_updaters.update_llama_index_documents(use_saved_embeddings=True)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "id": "d31ead22-ac7b-4174-b2cb-4086a6459473", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "vector_index = rag_engines.get_semantic_query_index()\n", 62 | "semantic_query_retriever = rag_engines.get_semantic_query_retriever_engine()\n", 63 | "sql_query_engine = rag_engines.get_sql_query_engine()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 6, 69 | "id": "3a76d593-450d-48fc-bbb6-74e0d2298bca", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "vector_db blogpost\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "print(rag_settings.VECTOR_DB_NAME, rag_settings.VECTOR_DB_TABLE_NAME)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "id": "872f70d5-fd38-4146-9eee-0f4ba554a8d7", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from llama_index.core.tools import QueryEngineTool\n", 92 | "\n", 93 | "vector_tool = QueryEngineTool.from_defaults(\n", 94 | " query_engine=semantic_query_retriever,\n", 95 | " description=(\n", 96 | " f\"Useful for answering semantic questions about different blog posts\"\n", 97 | " ),\n", 98 | ")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "id": "0f155a6a-4d60-40e4-b43e-2428a086c8d1", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "sql_tool = QueryEngineTool.from_defaults(\n", 109 | " query_engine=sql_query_engine,\n", 110 | " description=(\n", 111 | " \"Useful for translating a natural language query into a SQL query over\"\n", 112 | " \" a table containing: 
blog posts and page views each blog post\"\n", 113 | " ),\n", 114 | ")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 9, 120 | "id": "298422a3-bf77-4ade-a79d-e5f3dd69d172", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "query_engine = rag_patches.MySQLAutoVectorQueryEngine(\n", 125 | " sql_tool, \n", 126 | " vector_tool,\n", 127 | ")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 15, 133 | "id": "50961604-48ca-437d-ad24-7ac7855cab96", 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "\u001b[1;3;34mQuerying other query engine: The question 'What do you make?' is a semantic question about the content or purpose of blog posts, which aligns with choice (2) that is useful for answering semantic questions about different blog posts.\n", 141 | "\u001b[0m" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "response = query_engine.query(\n", 147 | " \"What do you make?\"\n", 148 | ")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 16, 154 | "id": "0af92a90-2914-4172-9c5d-8e3efbbb9c59", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "'If your job is to make decisions, doing it more productively and with less hassle or drama could be beneficial. Improving the quality of your decisions is crucial, as it is a common goal for professionals in any field. 
Making better decisions can lead to more effective outcomes and greater success in your work.'" 161 | ] 162 | }, 163 | "execution_count": 16, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "response.response" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "id": "78548bec-2197-42f8-ab50-3d095d1374c0", 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "\u001b[1;3;34mQuerying SQL database: The question requires translating a natural language query into a SQL query to retrieve the top 5 most viewed blog posts and analyze their content for keywords. This aligns with the functionality described in choice 1.\n", 183 | "\u001b[0m\u001b[1;3;33mSQL query: SELECT \n", 184 | " blog_blogpost.title, \n", 185 | " blog_blogpost.content, \n", 186 | " COUNT(analytics_pageview.id) AS view_count\n", 187 | "FROM \n", 188 | " blog_blogpost\n", 189 | "JOIN \n", 190 | " analytics_pageview ON blog_blogpost.id = analytics_pageview.post_id\n", 191 | "GROUP BY \n", 192 | " blog_blogpost.id\n", 193 | "ORDER BY \n", 194 | " view_count DESC\n", 195 | "LIMIT 5;\n", 196 | "\u001b[0m\u001b[1;3;33mSQL response: Here are the top 5 most viewed blog posts along with the keywords that their content includes:\n", 197 | "\n", 198 | "1. **Title: \"Taking it very seriously\"**\n", 199 | " - **View Count:** 2493\n", 200 | " - **Keywords:** April first, greeting, Happy, internet, apocalypse, pretend, smile\n", 201 | "\n", 202 | "2. **Title: \"“But we were comfortable”\"**\n", 203 | " - **View Count:** 2490\n", 204 | " - **Keywords:** digital shift, unwanted, risk, lonely, powerful, efficient, comfortable, follow\n", 205 | "\n", 206 | "3. 
**Title: \"All models are wrong, some models are useful\"**\n", 207 | " - **View Count:** 2471\n", 208 | " - **Keywords:** model, map, territory, approximation, problem, organization, opportunity, simplified\n", 209 | "\n", 210 | "4. **Title: \"The Fremen principle\"**\n", 211 | " - **View Count:** 2391\n", 212 | " - **Keywords:** new resources, limited resources, population, alternatives, distance learning, in-person lectures, tenure, accreditation\n", 213 | "\n", 214 | "5. **Title: \"Portfolio school: Get better clients\"**\n", 215 | " - **View Count:** 2383\n", 216 | " - **Keywords:** tragedy, health, economy, panic, focus, overwhelmed, health care workers\n", 217 | "\n", 218 | "These blog posts cover a range of topics from digital transformation and resource management to health and economic challenges, each resonating with a significant number of readers.\n", 219 | "\u001b[0m\u001b[1;3;34mTransformed query given SQL response: None\n", 220 | "\u001b[0m" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "response = query_engine.query(\n", 226 | " \"Are are the top 5 most viewed blog posts? What keywords do their content have?\"\n", 227 | ")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 18, 233 | "id": "96f78be4-960a-432c-98db-2c0af2078963", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/markdown": [ 239 | "Here are the top 5 most viewed blog posts along with the keywords that their content includes:\n", 240 | "\n", 241 | "1. **Title: \"Taking it very seriously\"**\n", 242 | " - **View Count:** 2493\n", 243 | " - **Keywords:** April first, greeting, Happy, internet, apocalypse, pretend, smile\n", 244 | "\n", 245 | "2. **Title: \"“But we were comfortable”\"**\n", 246 | " - **View Count:** 2490\n", 247 | " - **Keywords:** digital shift, unwanted, risk, lonely, powerful, efficient, comfortable, follow\n", 248 | "\n", 249 | "3. 
**Title: \"All models are wrong, some models are useful\"**\n", 250 | " - **View Count:** 2471\n", 251 | " - **Keywords:** model, map, territory, approximation, problem, organization, opportunity, simplified\n", 252 | "\n", 253 | "4. **Title: \"The Fremen principle\"**\n", 254 | " - **View Count:** 2391\n", 255 | " - **Keywords:** new resources, limited resources, population, alternatives, distance learning, in-person lectures, tenure, accreditation\n", 256 | "\n", 257 | "5. **Title: \"Portfolio school: Get better clients\"**\n", 258 | " - **View Count:** 2383\n", 259 | " - **Keywords:** tragedy, health, economy, panic, focus, overwhelmed, health care workers\n", 260 | "\n", 261 | "These blog posts cover a range of topics from digital transformation and resource management to health and economic challenges, each resonating with a significant number of readers." 262 | ], 263 | "text/plain": [ 264 | "" 265 | ] 266 | }, 267 | "metadata": {}, 268 | "output_type": "display_data" 269 | } 270 | ], 271 | "source": [ 272 | "from IPython.display import Markdown, display\n", 273 | "\n", 274 | "display(Markdown(response.response))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 25, 280 | "id": "9787d4a9-901a-4fc3-a97d-d04fd15b9599", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "\u001b[1;3;34mQuerying SQL database: The question requires translating a natural language query into a SQL query to retrieve the top 5 least viewed blog posts in the specified time frame.\n", 288 | "\u001b[0m\u001b[1;3;33mSQL query: SELECT \n", 289 | " blog_blogpost.id, \n", 290 | " blog_blogpost.title, \n", 291 | " COUNT(analytics_pageview.id) AS view_count\n", 292 | "FROM \n", 293 | " blog_blogpost\n", 294 | "LEFT JOIN \n", 295 | " analytics_pageview ON blog_blogpost.id = analytics_pageview.post_id\n", 296 | "WHERE \n", 297 | " analytics_pageview.timestamp BETWEEN '2024-01-01' AND 
'2025-12-31'\n", 298 | "GROUP BY \n", 299 | " blog_blogpost.id, blog_blogpost.title\n", 300 | "ORDER BY \n", 301 | " view_count ASC\n", 302 | "LIMIT 5;\n", 303 | "\u001b[0m\u001b[1;3;33mSQL response: Based on the query results, the top 5 least viewed blog posts from the year 2024 to 2025 are as follows:\n", 304 | "\n", 305 | "1. **\"Monopoly and network effects\"** with 525 views.\n", 306 | "2. **\"A Sunday book reading\"** with 558 views.\n", 307 | "3. **\"Helping leaders in college reboot\"** with 624 views.\n", 308 | "4. **\"Is everything going to be okay?\"** with 648 views.\n", 309 | "5. **\"You’re surrounded\"** with 654 views.\n", 310 | "\u001b[0m\u001b[1;3;34mTransformed query given SQL response: None\n", 311 | "\u001b[0mBased on the query results, the top 5 least viewed blog posts from the year 2024 to 2025 are as follows:\n", 312 | "\n", 313 | "1. **\"Monopoly and network effects\"** with 525 views.\n", 314 | "2. **\"A Sunday book reading\"** with 558 views.\n", 315 | "3. **\"Helping leaders in college reboot\"** with 624 views.\n", 316 | "4. **\"Is everything going to be okay?\"** with 648 views.\n", 317 | "5. **\"You’re surrounded\"** with 654 views.\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "response = query_engine.query(\n", 323 | " \"What are the top 5 least viewed blog posts in the year 2024 to 2025?\"\n", 324 | ")\n", 325 | "print(response.response)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 26, 331 | "id": "90fac10c-1674-49b4-8f5a-73eb4aa43461", 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/markdown": [ 337 | "Based on the query results, the top 5 least viewed blog posts from the year 2024 to 2025 are as follows:\n", 338 | "\n", 339 | "1. **\"Monopoly and network effects\"** with 525 views.\n", 340 | "2. **\"A Sunday book reading\"** with 558 views.\n", 341 | "3. **\"Helping leaders in college reboot\"** with 624 views.\n", 342 | "4. 
def init_django(project_name='cfehome', django_root='src'):
    """Bootstrap Django so the project's ORM can be used outside ``manage.py``.

    Args:
        project_name: Name of the Django project package; its ``settings``
            module is used unless ``DJANGO_SETTINGS_MODULE`` is already set.
        django_root: Directory (relative to the repository root) that
            contains the Django project.

    Side effects: changes the working directory, prepends the project
    directory to ``sys.path`` and mutates ``os.environ``.
    """
    project_dir = BASE_DIR / django_root
    settings_module = f"{project_name}.settings"

    # Work from inside the Django project so relative paths and imports
    # behave the same way they do under ``manage.py``.
    os.chdir(project_dir)
    sys.path.insert(0, str(project_dir))

    # An already-exported settings module wins; the async flag is always forced
    # (presumably so the synchronous ORM can be used from a notebook kernel).
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", settings_module)
    os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

    # Imported late: Django must only be loaded once the path and
    # environment above are in place.
    import django

    django.setup()
sentence-transformers 3 | jupyter 4 | openai 5 | pgvector 6 | # psycopg2[binary] 7 | psycopg[binary] 8 | python-decouple 9 | dj-database-url 10 | dateparser 11 | numpy 12 | sqlalchemy 13 | llama-index 14 | llama-index-llms-openai 15 | llama-index-vector-stores-postgres -------------------------------------------------------------------------------- /src/analytics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/talk-to-django/822446c14df61d0abb89e77ac5863ffadea33306/src/analytics/__init__.py -------------------------------------------------------------------------------- /src/analytics/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | from .models import PageView 5 | 6 | admin.site.register(PageView) -------------------------------------------------------------------------------- /src/analytics/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class AnalyticsConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "analytics" 7 | -------------------------------------------------------------------------------- /src/analytics/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/talk-to-django/822446c14df61d0abb89e77ac5863ffadea33306/src/analytics/management/__init__.py -------------------------------------------------------------------------------- /src/analytics/management/commands/__init__.py: -------------------------------------------------------------------------------- 
class Command(BaseCommand):
    """Seed random ``PageView`` rows for every deletable blog post.

    For each ``BlogPost`` with ``can_delete=True`` a random number of views
    between ``--min`` and ``--max`` (inclusive) is bulk-inserted inside a
    single transaction.
    """

    help = "Generate random page views for blog posts"

    def add_arguments(self, parser):
        """Register the ``--min`` / ``--max`` bounds for views per post."""
        parser.add_argument('--min', type=int, default=500, help='Minimum number of views')
        parser.add_argument('--max', type=int, default=2500, help='Maximum number of views')

    def handle(self, *args, **options):
        """Create the fake page views and report how many were added.

        Raises:
            CommandError: if ``--min`` is negative or greater than ``--max``.
        """
        from django.core.management.base import CommandError

        min_views = options['min']
        max_views = options['max']

        # random.randint raises ValueError when min > max, and a negative
        # count would silently create nothing while corrupting the total.
        if min_views < 0:
            raise CommandError("--min must be zero or greater")
        if max_views < min_views:
            raise CommandError("--max must be greater than or equal to --min")

        # One reference instant for the whole run keeps offsets consistent.
        now = timezone.now()
        total_views = 0

        with transaction.atomic():
            for post in BlogPost.objects.filter(can_delete=True):
                rand_views = random.randint(min_views, max_views)
                # NOTE(review): PageView.timestamp is declared with
                # auto_now_add=True; Django documents that an explicitly
                # supplied value is ignored on creation, so these randomised
                # timestamps are likely replaced by the insertion time.
                # Confirm, and switch the field to default=timezone.now if
                # back-dated views are required.
                page_views = [
                    PageView(
                        post=post,
                        timestamp=now - timedelta(days=random.randint(0, 30)),
                    )
                    for _ in range(rand_views)
                ]
                PageView.objects.bulk_create(page_views)
                total_views += rand_views

        self.stdout.write(self.style.SUCCESS(f"Random views completed. Total views added: {total_views}"))
class PageView(models.Model):
    """A single recorded view of a blog post."""

    # Deleting the referenced BlogPost also deletes its page views (CASCADE).
    post = models.ForeignKey(BlogPost, on_delete=models.CASCADE)
    # Set by Django when the row is first created.
    # NOTE(review): with auto_now_add=True an explicitly supplied timestamp is
    # documented as being ignored on creation -- confirm this is intended for
    # code that tries to back-date views.
    timestamp = models.DateTimeField(auto_now_add=True)
def parse_date(date_string):
    """Parse a free-form date string into a timezone-aware datetime.

    Args:
        date_string: Text understood by ``dateparser``; may be empty or None.

    Returns:
        An aware ``datetime``, or ``None`` when the input is empty or cannot
        be parsed.
    """
    if not date_string:
        return None
    parsed_date = dateparser.parse(date_string)
    if parsed_date is None:
        return None
    # Naive results are made aware in the current Django time zone.
    if timezone.is_naive(parsed_date):
        return timezone.make_aware(parsed_date)
    return parsed_date


class Command(BaseCommand):
    """Import the most recent rows of the blog dataset as ``BlogPost`` objects."""

    help = "Load blog posts from the datasets/seth-data.csv file"

    def add_arguments(self, parser):
        """Register ``--limit`` (number of trailing CSV rows to import)."""
        parser.add_argument(
            "--limit",
            type=int,
            default=50,
            help="Number of rows to import, counted from the end of the file",
        )

    def handle(self, *args, **options):
        """Create or update one ``BlogPost`` per imported CSV row.

        Raises:
            CommandError: if ``--limit`` is negative or the dataset file is
                missing.
        """
        from collections import deque

        from django.core.management.base import CommandError

        limit = options.get("limit", 50)
        if limit < 0:
            raise CommandError("--limit must be zero or greater")

        csv_path = BASE_DIR.parent / "datasets" / "seth-data.csv"
        try:
            # newline="" is required by the csv module so that line breaks
            # embedded in quoted fields (e.g. post bodies) are read intact.
            with open(csv_path, "r", encoding="utf-8", newline="") as file:
                # A bounded deque keeps only the trailing rows instead of
                # holding the whole dataset in memory.
                rows = deque(csv.DictReader(file), maxlen=limit)
        except FileNotFoundError as exc:
            raise CommandError(f"Dataset not found: {csv_path}") from exc

        for row in rows:
            created_at = parse_date(row["publication-date"])
            if not created_at:
                # If publication-date is invalid, try to extract from URL
                url = row.get("url") or ""
                created_at = parse_date("/".join(url.split("/")[-4:-2]))

            if not created_at:
                created_at = timezone.now()

            # Create or update the BlogPost, matching on title.
            # NOTE(review): BlogPost.timestamp uses auto_now_add=True, so the
            # supplied value is likely ignored when a row is first created and
            # only applied on a later update -- confirm.
            # BlogPost.save() may also request an embedding for each row.
            BlogPost.objects.update_or_create(
                title=row["title"],
                defaults={
                    "content": row["content_plain"],
                    "timestamp": created_at,
                    "can_delete": True,
                },
            )
            self.stdout.write(
                self.style.SUCCESS(
                    f"Successfully added/updated post: {row['title']} (Date: {created_at})"
                )
            )

        self.stdout.write(self.style.SUCCESS("Data import completed"))
("timestamp", models.DateTimeField(auto_now_add=True)), 28 | ], 29 | ), 30 | ] 31 | -------------------------------------------------------------------------------- /src/blog/migrations/0002_blogpost_embedding.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2024-07-31 16:09 2 | 3 | import pgvector.django 4 | from django.db import migrations 5 | 6 | 7 | class Migration(migrations.Migration): 8 | dependencies = [ 9 | ("blog", "0001_initial"), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name="blogpost", 15 | name="embedding", 16 | field=pgvector.django.VectorField( 17 | blank=True, dimensions=1536, null=True 18 | ), 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /src/blog/migrations/0003_blogpost_can_delete.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2024-07-31 16:53 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("blog", "0002_blogpost_embedding"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AddField( 13 | model_name="blogpost", 14 | name="can_delete", 15 | field=models.BooleanField( 16 | default=False, help_text="Use in jupyter notebooks" 17 | ), 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /src/blog/migrations/0004_blogpost__content.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2024-07-31 18:19 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | dependencies = [ 8 | ("blog", "0003_blogpost_can_delete"), 9 | ] 10 | 11 | operations = [ 12 | migrations.AddField( 13 | model_name="blogpost", 14 | name="_content", 15 | field=models.TextField(blank=True, null=True), 16 | ), 17 | ] 18 | 
class BlogPost(models.Model):
    """A blog post together with a vector embedding of its content.

    The embedding is refreshed in ``save()`` whenever the content differs
    from the snapshot stored in ``_content`` or no embedding exists yet.
    """

    # id -> models.BigAutoField() (see the initial migration)
    title = models.CharField(max_length=200)
    content = models.TextField()
    # Snapshot of ``content`` from the last save; used to detect edits.
    _content = models.TextField(blank=True, null=True)
    timestamp = models.DateTimeField(auto_now_add=True)
    embedding = VectorField(dimensions=EMEDDING_LENGTH, blank=True, null=True)
    can_delete = models.BooleanField(default=False, help_text="Use in jupyter notebooks")

    def save(self, *args, **kwargs):
        """Persist the post, recomputing the embedding when required.

        Accepts the same arguments as ``Model.save``. When the caller passes
        a non-empty ``update_fields`` keyword, any derived field changed here
        is added to it so the new values are actually written.
        """
        derived_fields = set()

        content_changed = self._content != self.content
        if content_changed:
            self._content = self.content
            derived_fields.add("_content")

        if self.embedding is None or content_changed:
            raw_embedding_text = self.get_embedding_text_raw()
            # Skip missing/blank text: the OpenAI embeddings API documents
            # that the input cannot be an empty string.
            if raw_embedding_text is not None and raw_embedding_text.strip():
                self.embedding = services.get_embedding(raw_embedding_text)
                derived_fields.add("embedding")

        # An empty/None update_fields is left untouched so Django's own
        # semantics (no-op / full save) are preserved.
        update_fields = kwargs.get("update_fields")
        if update_fields and derived_fields:
            kwargs["update_fields"] = set(update_fields) | derived_fields

        super().save(*args, **kwargs)

    def get_embedding_text_raw(self):
        """Return the text that represents this post for embedding purposes."""
        return self.content
def get_embedding(text, model=EMEDDING_MODEL):
    """Return the embedding vector for ``text`` from the OpenAI API.

    Args:
        text: String to embed; newlines are flattened to spaces and the
            result is stripped before the request is made.
        model: Name of the embedding model to use.

    Returns:
        The embedding of the single input, as returned by the client.
    """
    text = text.replace("\n", " ").strip()
    return client.embeddings.create(input=[text], model=model).data[0].embedding


def get_query_embedding(text):
    """Return the embedding for a search query.

    Currently a thin wrapper around ``get_embedding``; kept separate so that
    query embeddings can later be cached (get_or_create a query model).
    """
    return get_embedding(text)


def search_posts(query, limit=5):
    """Return the blog posts most similar to ``query``.

    Args:
        query: Natural-language search text.
        limit: Maximum number of posts to return.

    Returns:
        A sliced queryset of ``BlogPost`` ordered by ascending cosine
        distance, each annotated with ``distance`` and ``similarity``
        (``1 - distance``).
    """
    # Resolved lazily through the app registry: blog.models imports this
    # module, so a direct import would be circular.
    BlogPost = apps.get_model(app_label='blog', model_name='BlogPost')
    query_embedding = get_query_embedding(query)
    return (
        BlogPost.objects
        # Posts without an embedding have no meaningful distance; leaving
        # them in could pad the results with NULL-similarity rows.
        .filter(embedding__isnull=False)
        .annotate(
            distance=CosineDistance('embedding', query_embedding),
            similarity=1 - F("distance"),
        )
        .order_by("distance")[:limit]
    )
"""
Django settings for cfehome project.

Generated by 'django-admin startproject' using Django 5.0.7.

For more information on this file, see
https://docs.djangoproject.com/en/5.0/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.0/ref/settings/
"""
from decouple import config
import dj_database_url
from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# Set DJANGO_SECRET_KEY in the environment (or .env) for any real deployment;
# the default below is the original development-only key.
SECRET_KEY = config(
    "DJANGO_SECRET_KEY",
    default="django-insecure-%!a_5-bpjmu0j%dyb8yf+xs&5l2fz%^6&n4bok=g@7zj_q(w!m",
)

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = config("DJANGO_DEBUG", cast=bool, default=False)

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",
    "analytics",
    "blog",
    "products",
]

MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "cfehome.urls"

TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "cfehome.wsgi.application"


# Database
# https://docs.djangoproject.com/en/5.0/ref/settings/#databases

# SQLite is the fallback; a non-empty DATABASE_URL replaces it below.
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": BASE_DIR / "db.sqlite3",
    }
}
DATABASE_URL = config("DATABASE_URL", default="", cast=str)

if DATABASE_URL:
    DATABASES = {
        "default": dj_database_url.config(
            default=DATABASE_URL,
            conn_max_age=0,
        )
    }

# Password validation
# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
    },
]


# Internationalization
# https://docs.djangoproject.com/en/5.0/topics/i18n/

LANGUAGE_CODE = "en-us"

TIME_ZONE = "UTC"

USE_I18N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.0/howto/static-files/

STATIC_URL = "static/"

# Default primary key field type
# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
def main():
    """Entry point for Django's command-line utility.

    Points Django at the project's settings module (unless one is already
    configured in the environment) and dispatches ``sys.argv`` to the
    management framework.

    Raises:
        ImportError: if Django cannot be imported, with a hint about the
            most common causes.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cfehome.settings")

    missing_django_hint = (
        "Couldn't import Django. Are you sure it's installed and "
        "available on your PYTHONPATH environment variable? Did you "
        "forget to activate a virtual environment?"
    )

    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        # Re-raise with a friendlier message while keeping the original cause.
        raise ImportError(missing_django_hint) from exc
    else:
        execute_from_command_line(sys.argv)
29 | ("content", models.TextField()), 30 | ("timestamp", models.DateTimeField(auto_now_add=True)), 31 | ( 32 | "can_delete", 33 | models.BooleanField( 34 | default=False, help_text="Use in jupyter notebooks" 35 | ), 36 | ), 37 | ], 38 | ), 39 | migrations.CreateModel( 40 | name="Embedding", 41 | fields=[ 42 | ( 43 | "id", 44 | models.BigAutoField( 45 | auto_created=True, 46 | primary_key=True, 47 | serialize=False, 48 | verbose_name="ID", 49 | ), 50 | ), 51 | ( 52 | "embedding", 53 | pgvector.django.VectorField( 54 | blank=True, dimensions=1536, null=True 55 | ), 56 | ), 57 | ("object_id", models.PositiveIntegerField()), 58 | ( 59 | "content_type", 60 | models.ForeignKey( 61 | null=True, 62 | on_delete=django.db.models.deletion.SET_NULL, 63 | to="contenttypes.contenttype", 64 | ), 65 | ), 66 | ], 67 | ), 68 | ] 69 | -------------------------------------------------------------------------------- /src/products/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/talk-to-django/822446c14df61d0abb89e77ac5863ffadea33306/src/products/migrations/__init__.py -------------------------------------------------------------------------------- /src/products/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from pgvector.django import VectorField 3 | from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation 4 | from django.contrib.contenttypes.models import ContentType 5 | # Create your models here. 
class Product(models.Model):
    """A product with free-text content that can have embeddings attached.

    Embeddings are not stored on the product row itself; they are reached
    through the generic relation ``embedding_obj`` to ``Embedding``.
    """

    # The primary key ``id`` is added implicitly by Django; the initial
    # migration for this app creates it as a BigAutoField.
    title = models.CharField(max_length=200)
    content = models.TextField()
    # Reverse side of Embedding.content_object (generic foreign key).
    embedding_obj = GenericRelation(Embedding)
    # Set once, when the row is first created.
    timestamp = models.DateTimeField(auto_now_add=True)
    can_delete = models.BooleanField(default=False, help_text="Use in jupyter notebooks")

    def get_embedding_text_raw(self) -> str:
        """Return the text to embed for this product: its ``content`` field."""
        return self.content
def init_vector_db():
    """Create the vector database named by ``settings.VECTOR_DB_NAME`` if missing.

    Connects with the pooled database URL, looks the database name up in
    ``pg_database`` and, when it is absent, runs
    ``CREATE EXTENSION IF NOT EXISTS vector`` followed by ``CREATE DATABASE``.
    Nothing is changed when the database already exists.

    Returns:
        None.
    """
    db_url = get_database_url(use_pooling=True)
    # Use the *database* name here. The table name
    # (settings.VECTOR_DB_TABLE_NAME) is a different setting; checking or
    # creating a database under the table name would leave the database that
    # the vector store connects to uncreated.
    vector_db_name = settings.VECTOR_DB_NAME
    # CREATE DATABASE cannot run inside a transaction block, so every
    # statement on this engine is issued in autocommit mode.
    engine = create_engine(db_url, isolation_level="AUTOCOMMIT")
    try:
        with engine.connect() as connection:
            result = connection.execute(
                text("SELECT 1 FROM pg_database WHERE datname = :db_name"),
                {"db_name": vector_db_name},
            )
            if result.scalar() == 1:
                return
            # NOTE(review): this enables the extension in the database we are
            # connected to, not in the one about to be created - confirm that
            # the vector store enables it in the new database itself.
            connection.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
            # Identifiers cannot be sent as bind parameters, so quote the name
            # (doubling embedded quotes) before interpolating it. Quoting also
            # preserves the exact case that the lookup above compared against.
            quoted_name = '"' + vector_db_name.replace('"', '""') + '"'
            connection.execute(text(f"CREATE DATABASE {quoted_name}"))
    finally:
        # Release the pooled connections held by this one-off engine.
        engine.dispose()
Query.objects.get_or_create(query=query) 10 | # obj.get_query_embedding() 11 | return super()._get_query_embedding(query) 12 | 13 | def _get_text_embedding(self, text: str) -> List[float]: 14 | """Get text embedding.""" 15 | # print("texts", text) 16 | return super()._get_text_embedding(text) 17 | 18 | def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]: 19 | """Get text embeddings. 20 | 21 | By default, this is a wrapper around _get_text_embedding. 22 | Can be overridden for batch queries. 23 | """ 24 | # print("texts", texts) 25 | return super()._get_text_embeddings(texts) -------------------------------------------------------------------------------- /src/rag/engines.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from django.apps import apps 3 | from sqlalchemy import make_url, create_engine 4 | from llama_index.vector_stores.postgres import PGVectorStore 5 | from llama_index.core import VectorStoreIndex, StorageContext 6 | from llama_index.core.query_engine import RetrieverQueryEngine 7 | from llama_index.core import SQLDatabase 8 | from llama_index.core.query_engine import NLSQLTableQueryEngine 9 | from llama_index.core.retrievers import NLSQLRetriever 10 | 11 | from . 
def get_default_sql_engine_tables() -> List[str]:
    """Return the database table names exposed to the text-to-SQL engines.

    The tables are those of the ``blog.BlogPost`` and ``analytics.PageView``
    models, in that order. Models are resolved through the app registry at
    call time rather than imported at module level.
    """
    model_labels = (("blog", "BlogPost"), ("analytics", "PageView"))
    # Resolve every model first, then read the table names.
    resolved_models = [
        apps.get_model(app_label, model_name)
        for app_label, model_name in model_labels
    ]
    return [resolved._meta.db_table for resolved in resolved_models]
class MySQLAutoVectorQueryEngine(SQLAutoVectorQueryEngine):
    """SQL + vector query engine that accepts any ``RetrieverQueryEngine``.

    The constructor performs its own argument validation and then calls
    ``SQLJoinQueryEngine.__init__`` directly instead of going through
    ``SQLAutoVectorQueryEngine.__init__``.
    """

    def __init__(
        self,
        sql_query_tool: QueryEngineTool,
        vector_query_tool: QueryEngineTool,
        selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,
        llm: Optional[LLM] = None,
        service_context: Optional[ServiceContext] = None,
        sql_vector_synthesis_prompt: Optional[BasePromptTemplate] = None,
        sql_augment_query_transform: Optional[SQLAugmentQueryTransform] = None,
        use_sql_vector_synthesis: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        verbose: bool = True,
    ) -> None:
        """Validate both tools, then initialise the join query engine.

        Raises:
            ValueError: if the SQL tool does not wrap a
                ``BaseSQLTableQueryEngine``/``NLSQLTableQueryEngine``, or the
                vector tool does not wrap a ``RetrieverQueryEngine``.
        """
        # The SQL side must be one of the table query engines.
        sql_engine = sql_query_tool.query_engine
        accepted_sql_engines = (BaseSQLTableQueryEngine, NLSQLTableQueryEngine)
        if not isinstance(sql_engine, accepted_sql_engines):
            raise ValueError(
                "sql_query_tool.query_engine must be an instance of BaseSQLTableQueryEngine or NLSQLTableQueryEngine"
            )

        # The vector side only has to be a RetrieverQueryEngine; no
        # VectorIndexAutoRetriever requirement is enforced on its retriever.
        vector_engine = vector_query_tool.query_engine
        if not isinstance(vector_engine, RetrieverQueryEngine):
            raise ValueError(
                "vector_query_tool.query_engine must be an instance of RetrieverQueryEngine"
            )

        # Fall back to the library default when no (truthy) prompt was given.
        if not sql_vector_synthesis_prompt:
            sql_vector_synthesis_prompt = DEFAULT_SQL_VECTOR_SYNTHESIS_PROMPT

        # Map this class's "vector" option names onto the "join" names that
        # SQLJoinQueryEngine.__init__ takes.
        join_engine_options = {
            "selector": selector,
            "llm": llm,
            "service_context": service_context,
            "sql_join_synthesis_prompt": sql_vector_synthesis_prompt,
            "sql_augment_query_transform": sql_augment_query_transform,
            "use_sql_join_synthesis": use_sql_vector_synthesis,
            "callback_manager": callback_manager,
            "verbose": verbose,
        }
        SQLJoinQueryEngine.__init__(
            self, sql_query_tool, vector_query_tool, **join_engine_options
        )
def init():
    """Install the OpenAI LLM and embedding model on LlamaIndex's ``Settings``.

    Both clients are built from the module-level configuration values
    (``LLM_MODEL``, ``EMEDDING_MODEL``, ``OPENAI_API_KEY``) and assigned to
    ``Settings.llm`` and ``Settings.embed_model``.
    """
    shared_credentials = {"api_key": OPENAI_API_KEY}
    chat_model = OpenAI(model=LLM_MODEL, **shared_credentials)
    embedder = MyOpenAIEmbedding(model=EMEDDING_MODEL, **shared_credentials)
    # Both clients are constructed before either global is touched.
    Settings.llm, Settings.embed_model = chat_model, embedder
def update_llama_index_documents(use_saved_embeddings=True):
    """Re-insert every deletable blog post into the semantic query index.

    Each ``BlogPost`` with ``can_delete=True`` becomes a ``Document`` whose
    ``doc_id`` is the post id. Any existing reference document with that id is
    deleted from the index (and docstore) before the new one is inserted.

    Args:
        use_saved_embeddings: when true, a post's stored embedding (if it has
            one) is passed to the ``Document``; otherwise no embedding is
            supplied.
    """
    index = engines.get_semantic_query_index()
    post_model = apps.get_model("blog", "BlogPost")

    def build_document(post):
        # Build the Document keyword arguments for a single post.
        kwargs = dict(
            text=f"{post.get_embedding_text_raw()}",
            doc_id=str(post.id),
            metadata={"pk": post.pk, "title": post.title},
        )
        if use_saved_embeddings and post.embedding is not None:
            kwargs["embedding"] = list(post.embedding)
        return Document(**kwargs)

    # Build every document before touching the index.
    documents = [
        build_document(post)
        for post in post_model.objects.filter(can_delete=True)
    ]
    for document in documents:
        index.delete_ref_doc(f"{document.id_}", delete_from_docstore=True)
        index.insert(document)