├── .github
│   ├── CODEOWNERS
│   └── workflows
│       └── devdocs-review.yml
├── create-embeddings
│   ├── README.md
│   ├── open-source-existing-data.ipynb
│   ├── voyage-existing-data.ipynb
│   ├── openai-existing-data.ipynb
│   ├── open-source-new-data.ipynb
│   ├── voyage-new-data.ipynb
│   └── openai-new-data.ipynb
├── manage-indexes
│   ├── delete-indexes.ipynb
│   ├── view-indexes.ipynb
│   ├── edit-indexes.ipynb
│   ├── create-indexes-basic.ipynb
│   └── create-indexes-filter.ipynb
├── ai-integrations
│   ├── README.md
│   ├── langchain-parent-document-retrieval.ipynb
│   ├── langchain-hybrid-search.ipynb
│   ├── langchain-natural-language.ipynb
│   ├── langchain-local-rag.ipynb
│   ├── semantic-kernel.ipynb
│   ├── haystack.ipynb
│   ├── langchain.ipynb
│   ├── llamaindex.ipynb
│   ├── langchain-graphrag.ipynb
│   └── langchain-memory-semantic-cache.ipynb
├── README.md
├── use-cases
│   ├── local-rag.ipynb
│   ├── rag.ipynb
│   └── rag-with-voyage.ipynb
├── quantization
│   ├── existing-data.ipynb
│   └── new-data.ipynb
└── LICENSE

/.github/CODEOWNERS:
-------------------------------------------------------------------------------- 1 | @mongodb/devdocs @davidhou17 2 | 3 | /.github/ @mongodb/devdocs --------------------------------------------------------------------------------
/create-embeddings/README.md:
-------------------------------------------------------------------------------- 1 | # Create Embeddings 2 | 3 | This folder contains Jupyter Notebooks that describe how to generate 4 | vector embeddings. 5 | 6 | Select one of the following notebooks based on your preferred 7 | embedding model, and whether you're generating embeddings from 8 | new data or from data you already have in MongoDB. 9 | 10 | | Notebook | Description | 11 | |----------|-------------| 12 | | [open-source-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-new-data.ipynb) | Generate embeddings from new data using an open-source embedding model | 13 | | [open-source-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an open-source embedding model | 14 | | [voyage-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-new-data.ipynb) | Generate embeddings from new data using an embedding model from Voyage AI | 15 | | [voyage-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an embedding model from Voyage AI | 16 | | [openai-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-new-data.ipynb) | Generate embeddings from new data using an embedding model from OpenAI | 17 | | [openai-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an embedding model from OpenAI | 18 | --------------------------------------------------------------------------------
/.github/workflows/devdocs-review.yml:
-------------------------------------------------------------------------------- 1 | name: Notify DevDocs on Reviews 2 | 3 | on: 4 | issue_comment: 5 | types: [created, edited] 6 | pull_request_review_comment: 7 | types: [created, edited] 8 | pull_request_target: 9 | types: [opened, edited] 10 | pull_request_review: 11 | types: [submitted, edited] 12 | 13 | jobs: 14 | notify: 15 | runs-on: ubuntu-latest 16 | if: ${{ github.event.pull_request || github.event.issue.pull_request }} 17
| steps: 18 | - name: Send Slack Notification 19 | env: 20 | EVENT_BODY: ${{ github.event.pull_request.body || github.event.comment.body }} 21 | PR_TITLE: ${{ github.event.pull_request.title || github.event.issue.title }} 22 | PR_USER: ${{ github.event.pull_request.user.login || github.event.comment.user.login }} 23 | PR_URL: ${{ github.event.pull_request.html_url || github.event.comment.html_url }} 24 | REPO_NAME: ${{ github.repository }} 25 | SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }} 26 | run: | 27 | if ! echo "$EVENT_BODY" | grep -qE "@DevDocs|@mongodb/devdocs"; then 28 | echo "::notice::No '@DevDocs' or '@mongodb/devdocs' tag found. Skipping notification." 29 | exit 0 30 | fi 31 | 32 | # 2. Build the message and payload in one step 33 | SLACK_PAYLOAD=$(jq -n \ 34 | --arg repo "$REPO_NAME" \ 35 | --arg title "$PR_TITLE" \ 36 | --arg user "$PR_USER" \ 37 | --arg url "$PR_URL" \ 38 | '{ 39 | "channel": "#docs-devdocs-notifications", 40 | "username": "Issue Notifier", 41 | "icon_emoji": ":mega:", 42 | "text": "*📢 @DevDocs mentioned in * \($repo)\n*Title:* \($title)\n*By:* \($user)\n*URL:* \($url)" 43 | }') 44 | 45 | # 3. Send to Slack 46 | curl -X POST \ 47 | -H 'Content-type: application/json' \ 48 | --data "$SLACK_PAYLOAD" \ 49 | "$SLACK_WEBHOOK" 50 | -------------------------------------------------------------------------------- /manage-indexes/delete-indexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Delete Vector Indexes" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pymongo.mongo_client import MongoClient\n", 41 | "\n", 42 | "# Connect to your MongoDB cluster\n", 43 | "uri = \"\"\n", 44 | "client = MongoClient(uri)\n", 45 | "\n", 46 | "# Access your database and collection\n", 47 | "database = client[\"\"]\n", 48 | "collection = database[\"\"]\n", 49 | "\n", 50 | "# Delete your search index\n", 51 | "collection.drop_search_index(\"\")" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3 (ipykernel)", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.9.6" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /manage-indexes/view-indexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - View Vector Indexes" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pymongo.mongo_client import MongoClient\n", 41 | "\n", 42 | "# Connect to your MongoDB cluster\n", 43 | "uri = \"\"\n", 44 | "client = MongoClient(uri)\n", 45 | "\n", 46 | "# Access your database and collection\n", 47 | "database = client[\"\"]\n", 48 | "collection = database[\"\"]\n", 49 | "\n", 50 | "# Get a list of the collection's search indexes and print them\n", 51 | "cursor = collection.list_search_indexes()\n", 52 | "for index in cursor:\n", 53 | " print(index)" 54 | ] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3 (ipykernel)", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.9.6" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 2 78 | } 79 | -------------------------------------------------------------------------------- /ai-integrations/README.md: -------------------------------------------------------------------------------- 1 | # AI Integrations 2 | 3 | This folder contains Jupyter Notebooks that demonstrate how to integrate various AI frameworks with MongoDB. These notebooks show you how to implement RAG and other features for your AI-powered and agentic applications by leveraging MongoDB as both a vector database and document database. 
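Most of the LangChain notebooks below share the same starting point: a LangChain vector store backed by a MongoDB collection and a vector search index. The following is a minimal sketch of that shared setup (the connection string, namespace, and index name are placeholders; each notebook specifies its own values and embedding model):

```python
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_voyageai import VoyageAIEmbeddings

# Back a LangChain vector store with a MongoDB collection
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string="<connection-string>",  # your MongoDB connection string
    namespace="<database>.<collection>",      # collection that stores the documents and their embeddings
    embedding=VoyageAIEmbeddings(model="voyage-3-large"),
    index_name="vector_index",                # name of the vector search index on the embedding field
)
```

The other integrations (LlamaIndex, Haystack, Semantic Kernel) follow the same idea using their own vector store abstractions.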
4 | 5 | | Notebook | Description | 6 | |----------|-------------| 7 | | [langchain](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain.ipynb) | Implement basic RAG with LangChain and MongoDB Vector Search | 8 | | [langchain-memory-semantic-cache](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-memory-semantic-cache.ipynb) | Implement RAG with memory with LangChain and MongoDB | 9 | | [langchain-hybrid-search](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-hybrid-search.ipynb) | Combine vector search with full-text search using LangChain and MongoDB | 10 | | [langchain-parent-document-retrieval](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-parent-document-retrieval.ipynb) | Perform parent-document retrieval with LangChain and MongoDB | 11 | | [langchain-self-query-retrieval](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-self-query-retrieval.ipynb) | Perform self-querying retrieval with LangChain and MongoDB | 12 | | [langchain-local-rag](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-local-rag.ipynb) | Implement RAG with local models with LangChain and MongoDB | 13 | | [langchain-graphrag](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-graphrag.ipynb) | Implement graph-based RAG with LangChain and MongoDB | 14 | | [langchain-natural-language](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-natural-language.ipynb) | Perform natural language querying with LangChain and MongoDB | 15 | | [langgraph](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langgraph.ipynb) | Build an AI agent with LangGraph and MongoDB | 16 | | [llamaindex](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/llamaindex.ipynb) | Implement basic RAG with LlamaIndex and MongoDB | 17 | | [haystack](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/haystack.ipynb) | Implement basic RAG with Haystack and MongoDB | 18 | | [semantic-kernel](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/semantic-kernel.ipynb) | Implement basic RAG with Microsoft Semantic Kernel and MongoDB | 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MongoDB Documentation Notebooks 2 | 3 | This repository contains Jupyter Notebooks that follow 4 | tutorials and code examples in the official [MongoDB Vector Search documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/). You can run, download, and modify these notebooks as you learn how to use MongoDB Vector Search for your use case. 5 | 6 | ## Overview 7 | 8 | Each notebook corresponds to a page or example in our documentation. 9 | Refer to the docs page linked in each notebook for prerequisites, set-up instructions, and detailed explanations of the code. 
10 | 11 | The following table summarizes the contents of the notebooks in each directory: 12 | 13 | | Directory | Description | 14 | |--------------------|--------------------------------------------------| 15 | | [/create-embeddings](https://github.com/mongodb/docs-notebooks/tree/main/create-embeddings) | Learn how to generate embeddings for vector search | 16 | | [/get-started](https://github.com/mongodb/docs-notebooks/tree/main/get-started) | Complete our quick start tutorial | 17 | | [/ai-integrations](https://github.com/mongodb/docs-notebooks/tree/main/ai-integrations) | Build AI applications and agents with popular AI frameworks that integrate with MongoDB | 18 | | [/manage-indexes](https://github.com/mongodb/docs-notebooks/tree/main/manage-indexes) | Create, view, edit, and delete vector search indexes | 19 | | [/quantization](https://github.com/mongodb/docs-notebooks/tree/main/quantization) | Quantize your vector embeddings for efficient processing | 20 | | [/run-queries](https://github.com/mongodb/docs-notebooks/tree/main/run-queries) | Learn how to run vector search queries (ANN and ENN) | 21 | | [/use-cases](https://github.com/mongodb/docs-notebooks/tree/main/use-cases) | Implement RAG and build AI agents using a MongoDB-native retrieval system | 22 | 23 | ## Other Resources 24 | 25 | - [MongoDB Vector Search Documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/) 26 | - [Generative AI Use Cases Repository](https://github.com/mongodb-developer/GenAI-Showcase/tree/main) 27 | 28 | ## License 29 | 30 | This project is licensed under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0). 31 | 32 | ## Issues 33 | 34 | To report an issue with any of these notebooks, please leave feedback through 35 | the corresponding documentation page linked at the top of the file. Using the 36 | `Rate This Page` button, you can add a comment about the issue after leaving 37 | a star rating. 38 | 39 | ## Contributing 40 | 41 | We are not accepting public contributions to this repository at this 42 | time. 43 | -------------------------------------------------------------------------------- /manage-indexes/edit-indexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Edit Vector Indexes" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pymongo.mongo_client import MongoClient\n", 41 | "\n", 42 | "# Connect to your MongoDB cluster\n", 43 | "uri = \"\"\n", 44 | "client = MongoClient(uri)\n", 45 | "\n", 46 | "# Access your database and collection\n", 47 | "database = client[\"\"]\n", 48 | "collection = database[\"\"]\n", 49 | "\n", 50 | "definition = {\n", 51 | " \"fields\": [\n", 52 | " {\n", 53 | " \"type\": \"vector\",\n", 54 | " \"numDimensions\": ,\n", 55 | " \"path\": \"\",\n", 56 | " \"similarity\": \"euclidean | cosine | dotProduct\",\n", 57 | " \"quantization\": \" none | scalar | binary \"\n", 58 | " },\n", 59 | " {\n", 60 | " \"type\": \"filter\",\n", 61 | " \"path\": \"\"\n", 62 | " },\n", 63 | " ...\n", 64 | " ]\n", 65 | "}\n", 66 | " \n", 67 | "# Update your search index\n", 68 | "collection.update_search_index(\"\", definition)" 69 | ] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3 (ipykernel)", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.9.6" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /manage-indexes/create-indexes-basic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Vector Indexes - Basic Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion for the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pymongo.mongo_client import MongoClient\n", 41 | "from pymongo.operations import SearchIndexModel\n", 42 | "import time\n", 43 | "\n", 44 | "# Connect to your MongoDB cluster\n", 45 | "uri = \"\"\n", 46 | "client = MongoClient(uri)\n", 47 | "\n", 48 | "# Access your database and collection\n", 49 | "database = client[\"sample_mflix\"]\n", 50 | "collection = database[\"embedded_movies\"]\n", 51 | "\n", 52 | "# Create your index model, then create the search index\n", 53 | "search_index_model = SearchIndexModel(\n", 54 | " definition={\n", 55 | " \"fields\": [\n", 56 | " {\n", 57 | " \"type\": \"vector\",\n", 58 | " \"path\": \"plot_embedding_voyage_3_large\",\n", 59 | " \"numDimensions\": 2048,\n", 60 | " \"similarity\": \"dotProduct\"\n", 61 | " }\n", 62 | " ]\n", 63 | " },\n", 64 | " name=\"vector_index\",\n", 65 | " type=\"vectorSearch\",\n", 66 | ")\n", 67 | "\n", 68 | "result = collection.create_search_index(model=search_index_model)\n", 69 | "print(\"New search index named \" + result + \" is building.\")\n", 70 | "\n", 71 | "# Wait for initial sync to complete\n", 72 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 73 | "predicate=None\n", 74 | "if predicate is None:\n", 75 | " predicate = lambda index: index.get(\"queryable\") is True\n", 76 | "\n", 77 | "while True:\n", 78 | " indices = list(collection.list_search_indexes(result))\n", 79 | " if len(indices) and predicate(indices[0]):\n", 80 | " break\n", 81 | " time.sleep(5)\n", 82 | "print(result + \" is ready for querying.\")" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.9.6" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | -------------------------------------------------------------------------------- /manage-indexes/create-indexes-filter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Vector Indexes - Filter Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from pymongo.mongo_client import MongoClient\n", 41 | "from pymongo.operations import SearchIndexModel\n", 42 | "import time\n", 43 | "\n", 44 | "# Connect to your MongoDB cluster\n", 45 | "uri = \"\"\n", 46 | "client = MongoClient(uri)\n", 47 | "\n", 48 | "# Access your database and collection\n", 49 | "database = client[\"sample_mflix\"]\n", 50 | "collection = database[\"embedded_movies\"]\n", 51 | "\n", 52 | "# Create your index model, then create the search index\n", 53 | "search_index_model = SearchIndexModel(\n", 54 | " definition={\n", 55 | " \"fields\": [\n", 56 | " {\n", 57 | " \"type\": \"vector\",\n", 58 | " \"path\": \"plot_embedding_voyage_3_large\",\n", 59 | " \"numDimensions\": 2048,\n", 60 | " \"similarity\": \"dotProduct\",\n", 61 | " \"quantization\": \"scalar\"\n", 62 | " },\n", 63 | " {\n", 64 | " \"type\": \"filter\",\n", 65 | " \"path\": \"genres\"\n", 66 | " },\n", 67 | " {\n", 68 | " \"type\": \"filter\",\n", 69 | " \"path\": \"year\"\n", 70 | " }\n", 71 | " ]\n", 72 | " },\n", 73 | " name=\"vector_index\",\n", 74 | " type=\"vectorSearch\",\n", 75 | ")\n", 76 | "\n", 77 | "result = collection.create_search_index(model=search_index_model)\n", 78 | "print(\"New search index named \" + result + \" is building.\")\n", 79 | "\n", 80 | "# Wait for initial sync to complete\n", 81 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 82 | "predicate=None\n", 83 | "if predicate is None:\n", 84 | " predicate = lambda index: index.get(\"queryable\") is True\n", 85 | "\n", 86 | "while True:\n", 87 | " indices = list(collection.list_search_indexes(result))\n", 88 | " if len(indices) and predicate(indices[0]):\n", 89 | " break\n", 90 | " time.sleep(5)\n", 91 | "print(result + \" is ready for querying.\")\n", 92 | "\n", 93 | "client.close()\n" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.9.12" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 2 118 | } 119 | -------------------------------------------------------------------------------- /ai-integrations/langchain-parent-document-retrieval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LangChain MongoDB Integration - Parent Document Retrieval" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Parent Document Retrieval](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/parent-document-retrieval/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pymongo pypdf" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "\n", 42 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 44 | "MONGODB_URI = \"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 54 | "from langchain_community.document_loaders import PyPDFLoader\n", 55 | "\n", 56 | "# Load the PDF\n", 57 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12881/pdf\") \n", 58 | "data = loader.load()\n", 59 | "\n", 60 | "# Chunk into parent documents\n", 61 | "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)\n", 62 | "docs = parent_splitter.split_documents(data)\n", 63 | "\n", 64 | "# Print a document\n", 65 | "docs[0]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from langchain_mongodb.retrievers import MongoDBAtlasParentDocumentRetriever\n", 75 | "from langchain_voyageai import VoyageAIEmbeddings\n", 76 | "\n", 77 | "# Define the embedding model to use\n", 78 | "embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n", 79 | "\n", 80 | "# Define the chunking method for the child documents\n", 81 | "child_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n", 82 | "\n", 83 | "# Specify the database and collection name\n", 84 | "database_name = \"langchain_db\"\n", 85 | "collection_name = \"parent_document\"\n", 86 | "\n", 87 | "# Create the parent document retriever\n", 88 | "parent_doc_retriever = MongoDBAtlasParentDocumentRetriever.from_connection_string(\n", 89 | " connection_string = MONGODB_URI,\n", 90 | " child_splitter = child_splitter,\n", 91 | " embedding_model = embedding_model,\n", 92 | " database_name = database_name,\n", 93 | " collection_name = collection_name,\n", 94 | " text_key = \"page_content\",\n", 95 | " relevance_score_fn = \"dotProduct\",\n", 96 | " search_kwargs = { \"k\": 10 },\n", 97 | ")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# Ingest the documents into Atlas\n", 107 | "parent_doc_retriever.add_documents(docs)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Get the vector store instance from the retriever\n", 117 | "vector_store = parent_doc_retriever.vectorstore\n", 118 | "\n", 119 | "# Use helper method to create the vector search index\n", 120 | "vector_store.create_vector_search_index(\n", 121 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n", 122 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", 
123 | ")\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# Run a vector search query\n", 133 | "parent_doc_retriever.invoke(\"AI technology\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from langchain_core.output_parsers import StrOutputParser\n", 143 | "from langchain_core.prompts import PromptTemplate\n", 144 | "from langchain_core.runnables import RunnablePassthrough\n", 145 | "from langchain_openai import ChatOpenAI\n", 146 | "\n", 147 | "# Define a prompt template\n", 148 | "template = \"\"\"\n", 149 | " Use the following pieces of context to answer the question at the end.\n", 150 | " {context}\n", 151 | " Question: {query}?\n", 152 | "\"\"\"\n", 153 | "prompt = PromptTemplate.from_template(template)\n", 154 | "model = ChatOpenAI()\n", 155 | "\n", 156 | "# Construct a chain to answer questions on your data\n", 157 | "chain = (\n", 158 | " {\"context\": parent_doc_retriever, \"query\": RunnablePassthrough()}\n", 159 | " | prompt\n", 160 | " | model\n", 161 | " | StrOutputParser()\n", 162 | ")\n", 163 | "\n", 164 | "# Prompt the chain\n", 165 | "query = \"In a list, what are MongoDB's latest AI announcements?\"\n", 166 | "answer = chain.invoke(query)\n", 167 | "print(answer)" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.9.12" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 2 192 | } 193 | -------------------------------------------------------------------------------- /ai-integrations/langchain-hybrid-search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LangChain MongoDB Integration - Hybrid Search" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [LangChain Hybrid Search](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/hybrid-search/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai pymongo pypdf" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "\n", 42 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 43 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 44 | "MONGODB_URI = \"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n", 54 | "from langchain_voyageai import VoyageAIEmbeddings\n", 55 | "\n", 56 | "# Create the vector store\n", 57 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n", 58 | " connection_string = MONGODB_URI,\n", 59 | " embedding = VoyageAIEmbeddings(model = \"voyage-3-large\", output_dimension = 2048),\n", 60 | " namespace = \"sample_mflix.embedded_movies\",\n", 61 | " text_key = \"plot\",\n", 62 | " embedding_key = \"plot_embedding_voyage_3_large\",\n", 63 | " relevance_score_fn = \"dotProduct\"\n", 64 | ")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Use helper method to create the vector search index\n", 74 | "vector_store.create_vector_search_index(\n", 75 | " dimensions = 2048, # The dimensions of the vector embeddings to be indexed\n", 76 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from langchain_mongodb.index import create_fulltext_search_index\n", 87 | "from pymongo import MongoClient\n", 88 | "\n", 89 | "# Connect to your cluster\n", 90 | "client = MongoClient(MONGODB_URI)\n", 91 | "\n", 92 | "# Use helper method to create the search index\n", 93 | "create_fulltext_search_index(\n", 94 | " collection = client[\"sample_mflix\"][\"embedded_movies\"],\n", 95 | " field = \"plot\",\n", 96 | " index_name = \"search_index\",\n", 97 | " wait_until_complete = 60\n", 98 | ")" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever\n", 108 | "\n", 109 | "# Initialize the retriever\n", 110 | "retriever = MongoDBAtlasHybridSearchRetriever(\n", 111 | " vectorstore = vector_store,\n", 112 | " search_index_name = \"search_index\",\n", 113 | " top_k = 5,\n", 114 | " fulltext_penalty = 50,\n", 115 | " vector_penalty = 50,\n", 116 | " post_filter=[\n", 117 | " {\n", 118 | " \"$project\": {\n", 119 | " \"plot_embedding\": 0,\n", 120 | " \"plot_embedding_voyage_3_large\": 0\n", 121 | " }\n", 122 | " }\n", 123 | " ])\n", 124 | "\n", 125 | "# Define your query\n", 126 | "query = \"time travel\"\n", 127 | "\n", 128 | "# Print results\n", 129 | "documents = retriever.invoke(query)\n", 130 | "for doc in documents:\n", 131 | " print(\"Title: \" + 
doc.metadata[\"title\"])\n", 132 | " print(\"Plot: \" + doc.page_content)\n", 133 | " print(\"Search score: {}\".format(doc.metadata[\"fulltext_score\"]))\n", 134 | " print(\"Vector Search score: {}\".format(doc.metadata[\"vector_score\"]))\n", 135 | " print(\"Total score: {}\\n\".format(doc.metadata[\"fulltext_score\"] + doc.metadata[\"vector_score\"]))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from langchain_core.output_parsers import StrOutputParser\n", 145 | "from langchain_core.prompts import PromptTemplate\n", 146 | "from langchain_core.runnables import RunnablePassthrough\n", 147 | "from langchain_openai import ChatOpenAI\n", 148 | "\n", 149 | "# Define a prompt template\n", 150 | "template = \"\"\"\n", 151 | " Use the following pieces of context to answer the question at the end.\n", 152 | " {context}\n", 153 | " Question: Can you recommend some movies about {query}?\n", 154 | "\"\"\"\n", 155 | "prompt = PromptTemplate.from_template(template)\n", 156 | "model = ChatOpenAI()\n", 157 | "\n", 158 | "# Construct a chain to answer questions on your data\n", 159 | "chain = (\n", 160 | " {\"context\": retriever, \"query\": RunnablePassthrough()}\n", 161 | " | prompt\n", 162 | " | model\n", 163 | " | StrOutputParser()\n", 164 | ")\n", 165 | "\n", 166 | "# Prompt the chain\n", 167 | "query = \"time travel\"\n", 168 | "answer = chain.invoke(query)\n", 169 | "print(answer)" 170 | ] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 3", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.9.12" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 2 194 | } 195 | -------------------------------------------------------------------------------- /ai-integrations/langchain-natural-language.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f9696293", 6 | "metadata": {}, 7 | "source": [ 8 | "# Query MongoDB with Natural Language Using LangChain and LangGraph" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "e696dea0", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook is a companion to the [Query MongoDB with Natural Language Using LangChain and LangGraph](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/natural-language-to-mql/) tutorial. 
Refer to the page for set-up instructions and detailed explanations.\n", 17 | "\n", 18 | "This notebook demonstrates how to query a MongoDB cluster with a natural language prompt using an AI agent built with the [LangChain MongoDB Toolkit](https://langchain-mongodb.readthedocs.io/en/latest/langchain_mongodb/agent_toolkit/langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit.html#langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit) and the [LangGraph ReAct Agent Framework](https://langchain-ai.github.io/langgraph/agents/agents/).\n", 19 | "\n", 20 | "\n", 21 | " \"Open\n", 22 | "" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "f106dda9", 29 | "metadata": { 30 | "vscode": { 31 | "languageId": "shellscript" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "pip install --quiet --upgrade langchain-mongodb langchain-openai langgraph" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "998157e0", 42 | "metadata": {}, 43 | "source": [ 44 | "## Set up your environment\n", 45 | "\n", 46 | "Before you begin, make sure you have the following:\n", 47 | "\n", 48 | "- A MongoDB cluster up and running (you'll need the [connection string](https://www.mongodb.com/docs/manual/reference/connection-string/))\n", 49 | "- An API key to access an LLM (This tutorial uses a model from OpenAI, but you can use any model [supported by LangChain](https://python.langchain.com/docs/integrations/chat/))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "694ccd64", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import os\n\nos.environ[\"OPENAI_API_KEY\"] = ''\n", 60 | "MONGODB_URI = ''\n", 61 | "DB_NAME = 'sample_restaurants'\n", 62 | "NATURAL_LANGUAGE_QUERY = 'Find all restaurants that serve hamburgers.'" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "c764c565", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import os\n", 73 | "from langchain_openai import ChatOpenAI\n", 74 | "from langgraph.prebuilt import create_react_agent\n", 75 | "from langchain_mongodb.agent_toolkit import (\n", 76 | " MONGODB_AGENT_SYSTEM_PROMPT,\n", 77 | " MongoDBDatabase,\n", 78 | " MongoDBDatabaseToolkit,\n", 79 | ")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "5a6b006c", 85 | "metadata": {}, 86 | "source": [ 87 | "## Build the agent\n", 88 | "\n", 89 | "Next, define the `NaturalLanguageToMQL` Python class.\n", 90 | "\n", 91 | "#### Key Points\n", 92 | "\n", 93 | "- `self.toolkit`, the tools that the agent can use, is an instance of the [MongoDB Toolkit](https://langchain-mongodb.readthedocs.io/en/latest/langchain_mongodb/agent_toolkit/langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit.html#langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit). \n", 94 | "\n", 95 | "- `self.agent`, the agent itself, is an instance of the [ReAct Agent framework](https://langchain-ai.github.io/langgraph/agents/agents/), which takes `self.toolkit` as a parameter." 
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "b45185db", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "class NaturalLanguageToMQL:\n", 106 | " def __init__(self):\n", 107 | " self.llm = ChatOpenAI(model=\"gpt-4o-mini\", timeout=60)\n", 108 | " self.system_message = MONGODB_AGENT_SYSTEM_PROMPT.format(top_k=5)\n", 109 | " self.db_wrapper = MongoDBDatabase.from_connection_string(\n", 110 | " MONGODB_URI, \n", 111 | " database=DB_NAME)\n", 112 | " self.toolkit = MongoDBDatabaseToolkit(db=self.db_wrapper, llm=self.llm)\n", 113 | " self.agent = create_react_agent(\n", 114 | " self.llm, \n", 115 | " self.toolkit.get_tools(), \n", 116 | " state_modifier=self.system_message)\n", 117 | " self.messages = []\n", 118 | "\n", 119 | " def convert_to_mql_and_execute_query(self, query):\n", 120 | " # Start the agent with the agent.stream() method\n", 121 | " events = self.agent.stream(\n", 122 | " {'messages': [('user', query)]},\n", 123 | " stream_mode='values',\n", 124 | " )\n", 125 | " # Add output (events) from the agent to the self.messages list\n", 126 | " for event in events:\n", 127 | " self.messages.extend(event['messages'])\n", 128 | " \n", 129 | " def print_results(self):\n", 130 | " # Print the the end-user's expected output from \n", 131 | " # the final message produced by the agent.\n", 132 | " print(self.messages[-1].content)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "c90825eb", 138 | "metadata": {}, 139 | "source": [ 140 | "## Run a sample query\n", 141 | "\n", 142 | "And finally, instantiate the `NaturalLanguageToMQL` class and run a sample query." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "b7284c63", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "def main():\n", 153 | " converter = NaturalLanguageToMQL()\n", 154 | " converter.convert_to_mql_and_execute_query(NATURAL_LANGUAGE_QUERY)\n", 155 | " converter.print_results()\n", 156 | "\n", 157 | "main()" 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 3", 164 | "language": "python", 165 | "name": "python3" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.10.12" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 2 182 | } 183 | -------------------------------------------------------------------------------- /ai-integrations/langchain-local-rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LangChain MongoDB Integration - Implement RAG Locally" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [LangChain Local RAG](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/get-started/) tutorial. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "vscode": { 25 | "languageId": "shellscript" 26 | } 27 | }, 28 | "source": [ 29 | "## Create a local Atlas deployment\n", 30 | "\n", 31 | "Run the following command in your terminal to set up your local Atlas deployment. \n", 32 | "\n", 33 | "```\n", 34 | "atlas deployments setup\n", 35 | "```" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "vscode": { 42 | "languageId": "shellscript" 43 | } 44 | }, 45 | "source": [ 46 | "## Set up the environment" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "vscode": { 54 | "languageId": "shellscript" 55 | } 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "pip install --quiet --upgrade pymongo langchain langchain-community langchain-huggingface langchain-text-splitters gpt4all pypdf" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "MONGODB_URI = (\"mongodb://localhost:/?directConnection=true\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Configure the vector store" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n", 85 | "from langchain_huggingface import HuggingFaceEmbeddings\n", 86 | "\n", 87 | "# Load the embedding model (https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)\n", 88 | "embedding_model = HuggingFaceEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")\n", 89 | "\n", 90 | "# Instantiate vector store\n", 91 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n", 92 | " connection_string = MONGODB_URI,\n", 93 | " namespace = \"langchain_db.local_rag\",\n", 94 | " embedding=embedding_model,\n", 95 | " index_name=\"vector_index\"\n", 96 | ")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from langchain_community.document_loaders import PyPDFLoader\n", 106 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 107 | "\n", 108 | "# Load the PDF\n", 109 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n", 110 | "data = loader.load()\n", 111 | "\n", 112 | "# Split PDF into documents\n", 113 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n", 114 | "docs = text_splitter.split_documents(data)\n", 115 | "\n", 116 | "# Add data to the vector store\n", 117 | "vector_store.add_documents(docs)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "vector_store.create_vector_search_index(\n", 127 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n", 128 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Implement RAG with a local LLM\n", 137 | "Before running the following code, [download the local model](https://gpt4all.io/models/gguf/mistral-7b-openorca.gguf2.Q4_0.gguf)." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n", 147 | "from langchain_community.llms import GPT4All\n", 148 | "\n", 149 | "# Configure the LLM\n", 150 | "local_path = \"\"\n", 151 | "\n", 152 | "# Callbacks support token-wise streaming\n", 153 | "callbacks = [StreamingStdOutCallbackHandler()]\n", 154 | "\n", 155 | "# Verbose is required to pass to the callback manager\n", 156 | "llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from langchain_core.prompts import PromptTemplate\n", 166 | "from langchain_core.output_parsers import StrOutputParser\n", 167 | "from langchain_core.runnables import RunnablePassthrough\n", 168 | "import pprint\n", 169 | "\n", 170 | "# Instantiate MongoDB Vector Search as a retriever\n", 171 | "retriever = vector_store.as_retriever()\n", 172 | "\n", 173 | "# Define prompt template\n", 174 | "template = \"\"\"\n", 175 | "Use the following pieces of context to answer the question at the end.\n", 176 | "{context}\n", 177 | "Question: {question}\n", 178 | "\"\"\"\n", 179 | "custom_rag_prompt = PromptTemplate.from_template(template)\n", 180 | "\n", 181 | "def format_docs(docs):\n", 182 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", 183 | "\n", 184 | "# Create chain \n", 185 | "rag_chain = (\n", 186 | " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", 187 | " | custom_rag_prompt\n", 188 | " | llm\n", 189 | " | StrOutputParser()\n", 190 | ")\n", 191 | "\n", 192 | "# Prompt the chain\n", 193 | "question = \"What was MongoDB's latest acquisition?\"\n", 194 | "answer = rag_chain.invoke(question)\n", 195 | "\n", 196 | "# Return source documents\n", 197 | "documents = retriever.invoke(question)\n", 198 | "print(\"\\nSource documents:\")\n", 199 | "pprint.pprint(documents)" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.10.12" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /ai-integrations/semantic-kernel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Semantic Kernel Integration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Semantic Kernel Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/semantic-kernel/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade semantic-kernel openai motor" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import semantic_kernel as sk\n", 41 | "from semantic_kernel.connectors.ai.open_ai import (OpenAIChatCompletion, OpenAITextEmbedding)\n", 42 | "from semantic_kernel.connectors.memory.mongodb_atlas import MongoDBAtlasMemoryStore\n", 43 | "from semantic_kernel.core_plugins.text_memory_plugin import TextMemoryPlugin\n", 44 | "from semantic_kernel.memory.semantic_text_memory import SemanticTextMemory\n", 45 | "from semantic_kernel.prompt_template.input_variable import InputVariable\n", 46 | "from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig\n", 47 | "from pymongo import MongoClient\n", 48 | "from pymongo.operations import SearchIndexModel" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "OPENAI_API_KEY = \"\"\n", 58 | "MONGODB_URI = \"\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "kernel = sk.Kernel()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "chat_service = OpenAIChatCompletion(\n", 77 | " service_id=\"chat\",\n", 78 | " ai_model_id=\"gpt-3.5-turbo\",\n", 79 | " api_key=OPENAI_API_KEY\n", 80 | ")\n", 81 | "embedding_service = OpenAITextEmbedding(\n", 82 | " ai_model_id=\"text-embedding-ada-002\",\n", 83 | " api_key=OPENAI_API_KEY\n", 84 | ")\n", 85 | "kernel.add_service(chat_service)\n", 86 | "kernel.add_service(embedding_service)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "mongodb_atlas_memory_store = MongoDBAtlasMemoryStore(\n", 96 | " connection_string=MONGODB_URI,\n", 97 | " database_name=\"semantic_kernel_db\",\n", 98 | " index_name=\"vector_index\"\n", 99 | ")\n", 100 | "\n", 101 | "memory = SemanticTextMemory(\n", 102 | " storage=mongodb_atlas_memory_store,\n", 103 | " embeddings_generator=embedding_service\n", 104 | ")\n", 105 | "kernel.add_plugin(TextMemoryPlugin(memory), \"TextMemoryPlugin\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "async def populate_memory(kernel: sk.Kernel) -> None:\n", 115 | " await memory.save_information(\n", 116 | " collection=\"test\", id=\"1\", text=\"I am a developer\"\n", 117 | " )\n", 118 | " await memory.save_information(\n", 119 | " collection=\"test\", id=\"2\", text=\"I started using MongoDB two years ago\"\n", 120 | " )\n", 121 | " await memory.save_information(\n", 122 | " collection=\"test\", id=\"3\", text=\"I'm using MongoDB Vector Search with Semantic Kernel to implement RAG\"\n", 123 | " )\n", 124 | " await memory.save_information(\n", 125 | " collection=\"test\", id=\"4\", text=\"I like coffee\"\n", 126 | " )\n", 127 | "\n", 128 | "print(\"Populating memory...\")\n", 129 | "await 
populate_memory(kernel)\n", 130 | "print(kernel)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Connect to your MongoDB cluster and specify the collection\n", 140 | "client = MongoClient(MONGODB_URI)\n", 141 | "collection = client[\"semantic_kernel_db\"][\"test\"]\n", 142 | "\n", 143 | "# Create your index model, then create the search index\n", 144 | "search_index_model = SearchIndexModel(\n", 145 | " definition={\n", 146 | " \"fields\": [\n", 147 | " {\n", 148 | " \"type\": \"vector\",\n", 149 | " \"path\": \"embedding\",\n", 150 | " \"numDimensions\": 1536,\n", 151 | " \"similarity\": \"cosine\"\n", 152 | " }\n", 153 | " ]\n", 154 | " },\n", 155 | " name=\"vector_index\",\n", 156 | " type=\"vectorSearch\"\n", 157 | ")\n", 158 | "\n", 159 | "collection.create_search_index(model=search_index_model)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Semantic Search Query" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "result = await memory.search(\"test\", \"What is my job title?\")\n", 176 | "print(f\"Retrieved document: {result[0].text}, {result[0].relevance}\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Basic RAG" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "service_id = \"chat\"\n", 193 | "settings = kernel.get_service(service_id).instantiate_prompt_execution_settings(\n", 194 | " service_id=service_id\n", 195 | ")\n", 196 | "\n", 197 | "prompt_template = \"\"\"\n", 198 | " Answer the following question based on the given context.\n", 199 | "\n", 200 | " Question: {{$input}}\n", 201 | " Context: {{$context}}\n", 202 | "\"\"\"\n", 203 | "\n", 204 | "chat_prompt_template_config = PromptTemplateConfig(\n", 205 | " execution_settings=settings,\n", 206 | " input_variables=[\n", 207 | " InputVariable(name=\"input\"),\n", 208 | " InputVariable(name=\"context\")\n", 209 | " ],\n", 210 | " template=prompt_template\n", 211 | ")\n", 212 | "\n", 213 | "prompt = kernel.add_function(\n", 214 | " function_name=\"RAG\",\n", 215 | " plugin_name=\"TextMemoryPlugin\",\n", 216 | " prompt_template_config=chat_prompt_template_config,\n", 217 | ")\n", 218 | "\n", 219 | "question = \"When did I start using MongoDB?\"\n", 220 | "results = await memory.search(\"test\", question)\n", 221 | "retrieved_document = results[0].text\n", 222 | "answer = await prompt.invoke(\n", 223 | " kernel=kernel, input=question, context=retrieved_document\n", 224 | ")\n", 225 | "print(answer)" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.9.12" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /use-cases/local-rag.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Local Retrieval-Augmented Generation (RAG)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Local Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/tutorials/local-rag/) page. Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through a RAG implementation with MongoDB Vector Search that you can run **completely locally** by using models from Hugging Face and GPT4All.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Create a local Atlas deployment\n", 28 | "\n", 29 | "Run the following commands in your terminal to set up your local Atlas deployment. \n", 30 | "\n", 31 | "```\n", 32 | "atlas deployments setup\n", 33 | "curl https://atlas-education.s3.amazonaws.com/sampledata.archive -o sampledata.archive\n", 34 | "mongorestore --archive=sampledata.archive --port=\n", 35 | "```" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "vscode": { 43 | "languageId": "shellscript" 44 | } 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "pip install --quiet --upgrade pymongo gpt4all sentence_transformers" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "MONGODB_URI = (\"\")\n", 58 | "# Use \"mongodb://localhost:/?directConnection=true\" for local Atlas deployments" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from pymongo import MongoClient\n", 68 | "from sentence_transformers import SentenceTransformer\n", 69 | "\n", 70 | "# Connect to your local Atlas deployment or MongoDB cluster\n", 71 | "client = MongoClient(MONGODB_URI)\n", 72 | "\n", 73 | "# Select the sample_airbnb.listingsAndReviews collection\n", 74 | "collection = client[\"sample_airbnb\"][\"listingsAndReviews\"]\n", 75 | "\n", 76 | "# Load the embedding model (https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)\n", 77 | "model_path = \"\"\n", 78 | "model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')\n", 79 | "model.save(model_path)\n", 80 | "model = SentenceTransformer(model_path)\n", 81 | "\n", 82 | "# Define function to generate embeddings\n", 83 | "def get_embedding(text):\n", 84 | " return model.encode(text).tolist()\n", 85 | "\n", 86 | "# Filters for only documents with a summary field and without an embeddings field\n", 87 | "filter = { '$and': [ { 'summary': { '$exists': True, '$ne': None } }, { 'embeddings': { '$exists': False } } ] }\n", 88 | "\n", 89 | "# Creates embeddings for subset of the collection\n", 90 | "updated_doc_count = 0\n", 91 | "for document in collection.find(filter).limit(50):\n", 92 | " text = document['summary']\n", 93 | " embedding = get_embedding(text)\n", 94 | " collection.update_one({ '_id': document['_id'] }, { \"$set\": { 'embeddings': embedding } }, upsert=True)\n", 95 | " updated_doc_count += 1\n", 96 | "\n", 97 | "print(\"Documents updated: {}\".format(updated_doc_count))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | 
"metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from pymongo.operations import SearchIndexModel\n", 107 | "\n", 108 | "# Create your index model, then create the search index\n", 109 | "search_index_model = SearchIndexModel(\n", 110 | " definition = {\n", 111 | " \"fields\": [\n", 112 | " {\n", 113 | " \"type\": \"vector\",\n", 114 | " \"numDimensions\": 1024,\n", 115 | " \"path\": \"embeddings\",\n", 116 | " \"similarity\": \"cosine\"\n", 117 | " }\n", 118 | " ]\n", 119 | " },\n", 120 | " name = \"vector_index\",\n", 121 | " type = \"vectorSearch\" \n", 122 | ")\n", 123 | "collection.create_search_index(model=search_index_model)\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# Function to get the results of a vector search query\n", 133 | "def get_query_results(query):\n", 134 | " query_embedding = get_embedding(query)\n", 135 | "\n", 136 | " pipeline = [\n", 137 | " {\n", 138 | " \"$vectorSearch\": {\n", 139 | " \"index\": \"vector_index\",\n", 140 | " \"queryVector\": query_embedding,\n", 141 | " \"path\": \"embeddings\",\n", 142 | " \"exact\": True,\n", 143 | " \"limit\": 5\n", 144 | " }\n", 145 | " }, {\n", 146 | " \"$project\": {\n", 147 | " \"_id\": 0,\n", 148 | " \"summary\": 1,\n", 149 | " \"listing_url\": 1,\n", 150 | " \"score\": {\n", 151 | " \"$meta\": \"vectorSearchScore\"\n", 152 | " }\n", 153 | " }\n", 154 | " }\n", 155 | " ]\n", 156 | "\n", 157 | " results = collection.aggregate(pipeline)\n", 158 | "\n", 159 | " array_of_results = []\n", 160 | " for doc in results:\n", 161 | " array_of_results.append(doc)\n", 162 | " return array_of_results" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import pprint\n", 172 | "pprint.pprint(get_query_results(\"beach house\"))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from gpt4all import GPT4All\n", 182 | "\n", 183 | "# Download the model and move it to the same directory as this notebook\n", 184 | "# For complete details, refer to the documentation page\n", 185 | "local_llm_path = \"./mistral-7b-openorca.gguf2.Q4_0.gguf\"\n", 186 | "local_llm = GPT4All(local_llm_path)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "question = \"Can you recommend a few AirBnBs that are beach houses? Include a link to the listing.\"\n", 196 | "documents = get_query_results(question)\n", 197 | "\n", 198 | "text_documents = \"\"\n", 199 | "for doc in documents:\n", 200 | " summary = doc.get(\"summary\", \"\")\n", 201 | " link = doc.get(\"listing_url\", \"\")\n", 202 | " string = f\"Summary: {summary} Link: {link}. 
\\n\"\n", 203 | " text_documents += string\n", 204 | "\n", 205 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n", 206 | " {text_documents}\n", 207 | " Question: {question}\n", 208 | "\"\"\"\n", 209 | "\n", 210 | "response = local_llm.generate(prompt)\n", 211 | "cleaned_response = response.replace('\\\\n', '\\n')\n", 212 | "print(cleaned_response)" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.9.12" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 2 237 | } 238 | -------------------------------------------------------------------------------- /use-cases/rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Retrieval-Augmented Generation (RAG)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/rag/#get-started) tutorial. Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to implement RAG with MongoDB Vector Search by using open-source models from Hugging Face.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade pymongo sentence_transformers einops langchain langchain_community langchain-text-splitters pypdf huggingface_hub" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "\n", 44 | "# Specify your Hugging Face access token\n", 45 | "os.environ[\"HF_TOKEN\"] = \"\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from sentence_transformers import SentenceTransformer\n", 55 | "\n", 56 | "# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1\")\n", 57 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n", 58 | " \n", 59 | "# Define a function to generate embeddings\n", 60 | "def get_embedding(data):\n", 61 | " \"\"\"Generates vector embeddings for the given data.\"\"\"\n", 62 | "\n", 63 | " embedding = model.encode(data)\n", 64 | " return embedding.tolist()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from langchain_community.document_loaders import PyPDFLoader\n", 74 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 75 | "\n", 76 | "# Load the PDF\n", 77 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12236/pdf\")\n", 78 | "data = loader.load()\n", 79 | "\n", 80 | "# Split the data into chunks\n", 81 | 
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)\n", 82 | "documents = text_splitter.split_documents(data)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Prepare documents for insertion\n", 92 | "docs_to_insert = [{\n", 93 | " \"text\": doc.page_content,\n", 94 | " \"embedding\": get_embedding(doc.page_content)\n", 95 | "} for doc in documents]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from pymongo import MongoClient\n", 105 | "\n", 106 | "# Connect to your MongoDB cluster\n", 107 | "client = MongoClient(\"\")\n", 108 | "collection = client[\"rag_db\"][\"test\"]\n", 109 | "\n", 110 | "# Insert documents into the collection\n", 111 | "result = collection.insert_many(docs_to_insert)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "from pymongo.operations import SearchIndexModel\n", 121 | "import time\n", 122 | "\n", 123 | "# Create your index model, then create the search index\n", 124 | "index_name=\"vector_index\"\n", 125 | "search_index_model = SearchIndexModel(\n", 126 | " definition = {\n", 127 | " \"fields\": [\n", 128 | " {\n", 129 | " \"type\": \"vector\",\n", 130 | " \"numDimensions\": 768,\n", 131 | " \"path\": \"embedding\",\n", 132 | " \"similarity\": \"cosine\"\n", 133 | " }\n", 134 | " ]\n", 135 | " },\n", 136 | " name = index_name,\n", 137 | " type = \"vectorSearch\"\n", 138 | ")\n", 139 | "collection.create_search_index(model=search_index_model)\n", 140 | "\n", 141 | "# Wait for initial sync to complete\n", 142 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 143 | "predicate=None\n", 144 | "if predicate is None:\n", 145 | " predicate = lambda index: index.get(\"queryable\") is True\n", 146 | "\n", 147 | "while True:\n", 148 | " indices = list(collection.list_search_indexes(index_name))\n", 149 | " if len(indices) and predicate(indices[0]):\n", 150 | " break\n", 151 | " time.sleep(5)\n", 152 | "print(index_name + \" is ready for querying.\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Define a function to run vector search queries\n", 162 | "def get_query_results(query):\n", 163 | " \"\"\"Gets results from a vector search query.\"\"\"\n", 164 | "\n", 165 | " query_embedding = get_embedding(query)\n", 166 | " pipeline = [\n", 167 | " {\n", 168 | " \"$vectorSearch\": {\n", 169 | " \"index\": \"vector_index\",\n", 170 | " \"queryVector\": query_embedding,\n", 171 | " \"path\": \"embedding\",\n", 172 | " \"exact\": True,\n", 173 | " \"limit\": 5\n", 174 | " }\n", 175 | " }, {\n", 176 | " \"$project\": {\n", 177 | " \"_id\": 0,\n", 178 | " \"text\": 1\n", 179 | " }\n", 180 | " }\n", 181 | " ]\n", 182 | "\n", 183 | " results = collection.aggregate(pipeline)\n", 184 | "\n", 185 | " array_of_results = []\n", 186 | " for doc in results:\n", 187 | " array_of_results.append(doc)\n", 188 | " return array_of_results\n", 189 | "\n", 190 | "# Test the function with a sample query\n", 191 | "import pprint\n", 192 | "pprint.pprint(get_query_results(\"AI technology\"))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "from huggingface_hub import InferenceClient\n", 202 | "\n", 203 | "# Specify search query, retrieve relevant documents, and convert to string\n", 204 | "query = \"What are MongoDB's latest AI announcements?\"\n", 205 | "context_docs = get_query_results(query)\n", 206 | "context_string = \" \".join([doc[\"text\"] for doc in context_docs])\n", 207 | "\n", 208 | "# Construct prompt for the LLM using the retrieved documents as the context\n", 209 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n", 210 | " {context_string}\n", 211 | " Question: {query}\n", 212 | "\"\"\"\n", 213 | "\n", 214 | "# Use a model from Hugging Face\n", 215 | "llm = InferenceClient(\n", 216 | " \"mistralai/Mixtral-8x22B-Instruct-v0.1\",\n", 217 | " provider = \"fireworks-ai\", \n", 218 | " token = os.getenv(\"HF_TOKEN\"))\n", 219 | "\n", 220 | "# Prompt the LLM (this code varies depending on the model you use)\n", 221 | "output = llm.chat_completion(\n", 222 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n", 223 | " max_tokens=150\n", 224 | ")\n", 225 | "print(output.choices[0].message.content)" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": "Python 3", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.9.12" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /use-cases/rag-with-voyage.ipynb: 
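The retrieval functions in rag.ipynb and local-rag.ipynb run the `$vectorSearch` stage with `"exact": True`, which performs an exact (ENN) search over every indexed vector. For larger collections, an approximate (ANN) search typically scales better: omit `exact` and set `numCandidates` instead, keeping it greater than or equal to `limit`. The following is a minimal sketch of that variant, assuming the `collection`, `get_embedding` function, and `vector_index` defined in rag.ipynb above.

```python
# ANN variant of the retrieval function from rag.ipynb: replaces "exact": True
# with a numCandidates value that controls the size of the approximate search.
def get_query_results_ann(query):
    """Gets results from an approximate (ANN) vector search query."""
    query_embedding = get_embedding(query)
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": 100,  # must be >= limit; larger values trade speed for recall
                "limit": 5
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1
            }
        }
    ]
    return list(collection.aggregate(pipeline))
```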
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Retrieval-Augmented Generation (RAG)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/rag/#get-started) tutorial. Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to implement RAG with MongoDB Vector Search by using the ``voyage-3-large`` embedding model from Voyage AI and an open-source generative model from Hugging Face.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade pymongo sentence_transformers voyageai einops langchain langchain_community pypdf huggingface_hub" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "\n", 44 | "# Specify your Hugging Face access token and Voyage API key\n", 45 | "os.environ[\"HF_TOKEN\"] = \"\"\n", 46 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import voyageai\n", 56 | "\n", 57 | "# Specify the Voyage AI embedding model\n", 58 | "model = \"voyage-3-large\"\n", 59 | "vo = voyageai.Client()\n", 60 | "\n", 61 | "# Define a function to generate embeddings\n", 62 | "def get_embedding(data, input_type = \"document\"):\n", 63 | " embeddings = vo.embed(\n", 64 | " data, model = model, input_type = input_type\n", 65 | " ).embeddings\n", 66 | " return embeddings[0]" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from langchain_community.document_loaders import PyPDFLoader\n", 76 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 77 | "\n", 78 | "# Load the PDF\n", 79 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12236/pdf\")\n", 80 | "data = loader.load()\n", 81 | "\n", 82 | "# Split the data into chunks\n", 83 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)\n", 84 | "documents = text_splitter.split_documents(data)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Prepare documents for insertion\n", 94 | "docs_to_insert = [{\n", 95 | " \"text\": doc.page_content,\n", 96 | " \"embedding\": get_embedding(doc.page_content)\n", 97 | "} for doc in documents]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from pymongo import MongoClient\n", 107 | "\n", 108 | "# Connect to your MongoDB cluster\n", 109 | "client = MongoClient(\"\")\n", 110 | "collection = client[\"rag_db\"][\"test\"]\n", 111 | "\n", 112 | "# Insert documents into the collection\n", 113 | "result = collection.insert_many(docs_to_insert)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 
119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "from pymongo.operations import SearchIndexModel\n", 123 | "import time\n", 124 | "\n", 125 | "# Create your index model, then create the search index\n", 126 | "index_name=\"vector_index\"\n", 127 | "search_index_model = SearchIndexModel(\n", 128 | " definition = {\n", 129 | " \"fields\": [\n", 130 | " {\n", 131 | " \"type\": \"vector\",\n", 132 | " \"numDimensions\": 1024,\n", 133 | " \"path\": \"embedding\",\n", 134 | " \"similarity\": \"cosine\"\n", 135 | " }\n", 136 | " ]\n", 137 | " },\n", 138 | " name = index_name,\n", 139 | " type = \"vectorSearch\"\n", 140 | ")\n", 141 | "collection.create_search_index(model=search_index_model)\n", 142 | "\n", 143 | "# Wait for initial sync to complete\n", 144 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 145 | "predicate=None\n", 146 | "if predicate is None:\n", 147 | " predicate = lambda index: index.get(\"queryable\") is True\n", 148 | "\n", 149 | "while True:\n", 150 | " indices = list(collection.list_search_indexes(index_name))\n", 151 | " if len(indices) and predicate(indices[0]):\n", 152 | " break\n", 153 | " time.sleep(5)\n", 154 | "print(index_name + \" is ready for querying.\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# Define a function to run vector search queries\n", 164 | "def get_query_results(query):\n", 165 | " \"\"\"Gets results from a vector search query.\"\"\"\n", 166 | "\n", 167 | " query_embedding = get_embedding(query, input_type=\"query\")\n", 168 | " pipeline = [\n", 169 | " {\n", 170 | " \"$vectorSearch\": {\n", 171 | " \"index\": \"vector_index\",\n", 172 | " \"queryVector\": query_embedding,\n", 173 | " \"path\": \"embedding\",\n", 174 | " \"exact\": True,\n", 175 | " \"limit\": 5\n", 176 | " }\n", 177 | " }, {\n", 178 | " \"$project\": {\n", 179 | " \"_id\": 0,\n", 180 | " \"text\": 1\n", 181 | " }\n", 182 | " }\n", 183 | " ]\n", 184 | "\n", 185 | " results = collection.aggregate(pipeline)\n", 186 | "\n", 187 | " array_of_results = []\n", 188 | " for doc in results:\n", 189 | " array_of_results.append(doc)\n", 190 | " return array_of_results\n", 191 | "\n", 192 | "# Test the function with a sample query\n", 193 | "import pprint\n", 194 | "pprint.pprint(get_query_results(\"AI technology\"))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from huggingface_hub import InferenceClient\n", 204 | "\n", 205 | "# Specify search query, retrieve relevant documents, and convert to string\n", 206 | "query = \"What are MongoDB's latest AI announcements?\"\n", 207 | "context_docs = get_query_results(query)\n", 208 | "context_string = \" \".join([doc[\"text\"] for doc in context_docs])\n", 209 | "\n", 210 | "# Construct prompt for the LLM using the retrieved documents as the context\n", 211 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n", 212 | " {context_string}\n", 213 | " Question: {query}\n", 214 | "\"\"\"\n", 215 | "\n", 216 | "# Use a model from Hugging Face\n", 217 | "llm = InferenceClient(\n", 218 | " \"mistralai/Mixtral-8x22B-Instruct-v0.1\",\n", 219 | " provider = \"fireworks-ai\",\n", 220 | " token = os.getenv(\"HF_TOKEN\"))\n", 221 | "\n", 222 | "# Prompt the LLM (this code varies depending on the model you use)\n", 223 | "output = llm.chat_completion(\n", 224 
| " messages=[{\"role\": \"user\", \"content\": prompt}],\n", 225 | " max_tokens=150\n", 226 | ")\n", 227 | "print(output.choices[0].message.content)" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.9.12" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 2 252 | } 253 | -------------------------------------------------------------------------------- /create-embeddings/open-source-existing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - Open Source - Existing Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using the open-source ``nomic-embed-text-v1`` model.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade sentence-transformers pymongo einops" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from sentence_transformers import SentenceTransformer\n", 54 | "\n", 55 | "# Load the embedding model\n", 56 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n", 57 | "\n", 58 | "# Define a function to generate embeddings\n", 59 | "def get_embedding(data, precision=\"float32\"):\n", 60 | " return model.encode(data, precision=precision).tolist()\n", 61 | "\n", 62 | "# Generate an embedding\n", 63 | "embedding = get_embedding(\"foo\")\n", 64 | "print(embedding)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### (Optional) Compress your embeddings\n", 72 | "\n", 73 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
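Because the `get_embedding` function in this notebook accepts a `precision` argument, you can also store scalar-quantized vectors rather than `float32` vectors. The sketch below is a possible extension, not part of the original notebook: it assumes the `generate_bson_vector` helper defined in the next cell and a `bson` package version that supports `BinaryVectorDtype.INT8`.

```python
# Hypothetical extension: generate an int8 (scalar-quantized) embedding and convert
# it to a BSON binData vector. Assumes get_embedding(data, precision=...) and
# generate_bson_vector(vector, vector_dtype) from this notebook, and that your
# installed bson package provides BinaryVectorDtype.INT8.
from bson.binary import BinaryVectorDtype

int8_embedding = get_embedding("foo", precision="int8")
bson_int8_embedding = generate_bson_vector(int8_embedding, BinaryVectorDtype.INT8)
print(f"The converted int8 BSON embedding is: {bson_int8_embedding}")
```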
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from bson.binary import Binary \n", 83 | "from bson.binary import BinaryVectorDtype\n", 84 | "\n", 85 | "# Define a function to generate BSON vectors\n", 86 | "def generate_bson_vector(vector, vector_dtype):\n", 87 | " return Binary.from_vector(vector, vector_dtype)\n", 88 | "\n", 89 | "# Generate BSON vector from the sample float32 embedding\n", 90 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 91 | "\n", 92 | "# Print the converted embedding\n", 93 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Generate Embeddings" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import pymongo\n", 110 | "\n", 111 | "# Connect to your MongoDB cluster\n", 112 | "mongo_client = pymongo.MongoClient(\"\")\n", 113 | "db = mongo_client[\"sample_airbnb\"]\n", 114 | "collection = db[\"listingsAndReviews\"]\n", 115 | "\n", 116 | "# Define a filter to exclude documents with null or empty 'summary' fields\n", 117 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n", 118 | "\n", 119 | "# Get a subset of documents in the collection\n", 120 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from pymongo import UpdateOne\n", 130 | "\n", 131 | "# Generate the list of bulk write operations\n", 132 | "operations = []\n", 133 | "for doc in documents:\n", 134 | " summary = doc[\"summary\"]\n", 135 | " # Generate embeddings for this document\n", 136 | " embedding = get_embedding(summary)\n", 137 | "\n", 138 | " # Uncomment the following line to convert to BSON vectors\n", 139 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 140 | "\n", 141 | " # Add the update operation to the list\n", 142 | " operations.append(UpdateOne(\n", 143 | " {\"_id\": doc[\"_id\"]},\n", 144 | " {\"$set\": {\n", 145 | " \"embedding\": embedding\n", 146 | " }}\n", 147 | " ))\n", 148 | "\n", 149 | "# Execute the bulk write operation\n", 150 | "if operations:\n", 151 | " result = collection.bulk_write(operations)\n", 152 | " updated_doc_count = result.modified_count\n", 153 | "\n", 154 | "print(f\"Updated {updated_doc_count} documents.\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Index and Query Your Embeddings" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "from pymongo.operations import SearchIndexModel\n", 171 | "import time\n", 172 | "\n", 173 | "# Create your index model, then create the search index\n", 174 | "search_index_model = SearchIndexModel(\n", 175 | " definition = {\n", 176 | " \"fields\": [\n", 177 | " {\n", 178 | " \"type\": \"vector\",\n", 179 | " \"path\": \"embedding\",\n", 180 | " \"similarity\": \"dotProduct\",\n", 181 | " \"numDimensions\": 768\n", 182 | " }\n", 183 | " ]\n", 184 | " },\n", 185 | " name=\"vector_index\",\n", 186 | " type=\"vectorSearch\"\n", 187 | ")\n", 188 | "result = 
collection.create_search_index(model=search_index_model)\n", 189 | "\n", 190 | "# Wait for initial sync to complete\n", 191 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 192 | "predicate=None\n", 193 | "if predicate is None:\n", 194 | " predicate = lambda index: index.get(\"queryable\") is True\n", 195 | "\n", 196 | "while True:\n", 197 | " indices = list(collection.list_search_indexes(result))\n", 198 | " if len(indices) and predicate(indices[0]):\n", 199 | " break\n", 200 | " time.sleep(5)\n", 201 | "print(result + \" is ready for querying.\")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Generate embedding for the search query\n", 211 | "query_embedding = get_embedding(\"beach house\")\n", 212 | "\n", 213 | "# Sample vector search pipeline\n", 214 | "pipeline = [\n", 215 | " {\n", 216 | " \"$vectorSearch\": {\n", 217 | " \"index\": \"vector_index\",\n", 218 | " \"queryVector\": query_embedding,\n", 219 | " \"path\": \"embedding\",\n", 220 | " \"exact\": True,\n", 221 | " \"limit\": 5\n", 222 | " }\n", 223 | " }, \n", 224 | " {\n", 225 | " \"$project\": {\n", 226 | " \"_id\": 0, \n", 227 | " \"summary\": 1,\n", 228 | " \"score\": {\n", 229 | " \"$meta\": \"vectorSearchScore\"\n", 230 | " }\n", 231 | " }\n", 232 | " }\n", 233 | "]\n", 234 | "\n", 235 | "# Execute the search\n", 236 | "results = collection.aggregate(pipeline)\n", 237 | "\n", 238 | "# Print results\n", 239 | "for i in results:\n", 240 | " print(i)\n" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.10.12" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /create-embeddings/voyage-existing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - Voyage AI - Existing Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using the ``voyage-3-large`` model from Voyage AI.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade voyageai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os\n", 54 | "import voyageai\n", 55 | "\n", 56 | "# Specify your Voyage API key and embedding model\n", 57 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 58 | "model = \"voyage-3-large\"\n", 59 | "vo = voyageai.Client()\n", 60 | "\n", 61 | "# Define a function to generate embeddings\n", 62 | "def get_embedding(data, input_type = \"document\"):\n", 63 | " embeddings = vo.embed(\n", 64 | " data, model = model, input_type = input_type\n", 65 | " ).embeddings\n", 66 | " return embeddings[0]\n", 67 | "\n", 68 | "# Generate an embedding\n", 69 | "embedding = get_embedding(\"foo\")\n", 70 | "print(embedding)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### (Optional) Compress your embeddings\n", 78 | "\n", 79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from bson.binary import Binary \n", 89 | "from bson.binary import BinaryVectorDtype\n", 90 | "\n", 91 | "# Define a function to generate BSON vectors\n", 92 | "def generate_bson_vector(vector, vector_dtype):\n", 93 | " return Binary.from_vector(vector, vector_dtype)\n", 94 | "\n", 95 | "# Generate BSON vector from the sample float32 embedding\n", 96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 97 | "\n", 98 | "# Print the converted embedding\n", 99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Generate Embeddings" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "import pymongo\n", 116 | "\n", 117 | "# Connect to your MongoDB cluster\n", 118 | "mongo_client = pymongo.MongoClient(\"\")\n", 119 | "db = mongo_client[\"sample_airbnb\"]\n", 120 | "collection = db[\"listingsAndReviews\"]\n", 121 | "\n", 122 | "# Define a filter to exclude documents with null or empty 'summary' fields\n", 123 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n", 124 | "\n", 125 | "# Get a subset of documents in the collection\n", 126 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from pymongo import UpdateOne\n", 136 | "\n", 137 | "# Generate the list of bulk write operations\n", 138 | "operations = []\n", 139 | "for doc in documents:\n", 140 | " summary = doc[\"summary\"]\n", 141 | " # Generate embeddings for this document\n", 142 | " embedding = get_embedding(summary)\n", 143 | "\n", 144 | " # Uncomment the following line to convert to BSON vectors\n", 145 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 146 | "\n", 147 | " # Add the update operation to the list\n", 148 | " operations.append(UpdateOne(\n", 149 | " {\"_id\": doc[\"_id\"]},\n", 150 | " {\"$set\": {\n", 151 | " \"embedding\": embedding\n", 152 | " }}\n", 153 | " ))\n", 154 | "\n", 155 | "# Execute the bulk write operation\n", 156 | "if operations:\n", 157 | " result = collection.bulk_write(operations)\n", 158 | " updated_doc_count = result.modified_count\n", 159 | "\n", 160 | "print(f\"Updated {updated_doc_count} documents.\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Index and Query Your Embeddings" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from pymongo.operations import SearchIndexModel\n", 177 | "import time\n", 178 | "\n", 179 | "# Create your index model, then create the search index\n", 180 | "search_index_model = SearchIndexModel(\n", 181 | " definition = {\n", 182 | " \"fields\": [\n", 183 | " {\n", 184 | " \"type\": \"vector\",\n", 185 | " \"path\": \"embedding\",\n", 186 | " \"similarity\": \"dotProduct\",\n", 187 | " \"numDimensions\": 1024\n", 188 | " }\n", 189 | " ]\n", 190 | " },\n", 191 | " name=\"vector_index\",\n", 192 | " type=\"vectorSearch\"\n", 193 | ")\n", 194 | "result = 
collection.create_search_index(model=search_index_model)\n", 195 | "\n", 196 | "# Wait for initial sync to complete\n", 197 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 198 | "predicate=None\n", 199 | "if predicate is None:\n", 200 | " predicate = lambda index: index.get(\"queryable\") is True\n", 201 | "\n", 202 | "while True:\n", 203 | " indices = list(collection.list_search_indexes(result))\n", 204 | " if len(indices) and predicate(indices[0]):\n", 205 | " break\n", 206 | " time.sleep(5)\n", 207 | "print(result + \" is ready for querying.\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Generate embedding for the search query\n", 217 | "query_embedding = get_embedding(\"beach house\", input_type=\"query\")\n", 218 | "\n", 219 | "# Sample vector search pipeline\n", 220 | "pipeline = [\n", 221 | " {\n", 222 | " \"$vectorSearch\": {\n", 223 | " \"index\": \"vector_index\",\n", 224 | " \"queryVector\": query_embedding,\n", 225 | " \"path\": \"embedding\",\n", 226 | " \"exact\": True,\n", 227 | " \"limit\": 5\n", 228 | " }\n", 229 | " }, \n", 230 | " {\n", 231 | " \"$project\": {\n", 232 | " \"_id\": 0, \n", 233 | " \"summary\": 1,\n", 234 | " \"score\": {\n", 235 | " \"$meta\": \"vectorSearchScore\"\n", 236 | " }\n", 237 | " }\n", 238 | " }\n", 239 | "]\n", 240 | "\n", 241 | "# Execute the search\n", 242 | "results = collection.aggregate(pipeline)\n", 243 | "\n", 244 | "# Print results\n", 245 | "for i in results:\n", 246 | " print(i)\n" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.10.12" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 2 271 | } 272 | -------------------------------------------------------------------------------- /create-embeddings/openai-existing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - OpenAI - Existing Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using OpenAI's ``text-embedding-3-small`` model.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade openai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os\n", 54 | "from openai import OpenAI\n", 55 | "\n", 56 | "# Specify your OpenAI API key and embedding model\n", 57 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 58 | "model = \"text-embedding-3-small\"\n", 59 | "openai_client = OpenAI()\n", 60 | "\n", 61 | "# Define a function to generate embeddings\n", 62 | "def get_embedding(text):\n", 63 | " \"\"\"Generates vector embeddings for the given text.\"\"\"\n", 64 | "\n", 65 | " embedding = openai_client.embeddings.create(input = [text], model=model).data[0].embedding\n", 66 | " return embedding\n", 67 | "\n", 68 | "# Generate an embedding\n", 69 | "embedding = get_embedding(\"foo\")\n", 70 | "print(embedding)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### (Optional) Compress your embeddings\n", 78 | "\n", 79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from bson.binary import Binary \n", 89 | "from bson.binary import BinaryVectorDtype\n", 90 | "\n", 91 | "# Define a function to generate BSON vectors\n", 92 | "def generate_bson_vector(vector, vector_dtype):\n", 93 | " return Binary.from_vector(vector, vector_dtype)\n", 94 | "\n", 95 | "# Generate BSON vector from the sample float32 embedding\n", 96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 97 | "\n", 98 | "# Print the converted embedding\n", 99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Generate Embeddings" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "import pymongo\n", 116 | "\n", 117 | "# Connect to your MongoDB cluster\n", 118 | "mongo_client = pymongo.MongoClient(\"\")\n", 119 | "db = mongo_client[\"sample_airbnb\"]\n", 120 | "collection = db[\"listingsAndReviews\"]\n", 121 | "\n", 122 | "# Define a filter to exclude documents with null or empty 'summary' fields\n", 123 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n", 124 | "\n", 125 | "# Get a subset of documents in the collection\n", 126 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from pymongo import UpdateOne\n", 136 | "\n", 137 | "# Generate the list of bulk write operations\n", 138 | "operations = []\n", 139 | "for doc in documents:\n", 140 | " summary = doc[\"summary\"]\n", 141 | " # Generate embeddings for this document\n", 142 | " embedding = get_embedding(summary)\n", 143 | "\n", 144 | " # Uncomment the following line to convert to BSON vectors\n", 145 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 146 | "\n", 147 | " # Add the update operation to the list\n", 148 | " operations.append(UpdateOne(\n", 149 | " {\"_id\": doc[\"_id\"]},\n", 150 | " {\"$set\": {\n", 151 | " \"embedding\": embedding\n", 152 | " }}\n", 153 | " ))\n", 154 | "\n", 155 | "# Execute the bulk write operation\n", 156 | "if operations:\n", 157 | " result = collection.bulk_write(operations)\n", 158 | " updated_doc_count = result.modified_count\n", 159 | "\n", 160 | "print(f\"Updated {updated_doc_count} documents.\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Index and Query Your Embeddings" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from pymongo.operations import SearchIndexModel\n", 177 | "import time\n", 178 | "\n", 179 | "# Create your index model, then create the search index\n", 180 | "search_index_model = SearchIndexModel(\n", 181 | " definition = {\n", 182 | " \"fields\": [\n", 183 | " {\n", 184 | " \"type\": \"vector\",\n", 185 | " \"path\": \"embedding\",\n", 186 | " \"similarity\": \"dotProduct\",\n", 187 | " \"numDimensions\": 1536\n", 188 | " }\n", 189 | " ]\n", 190 | " },\n", 191 | " name=\"vector_index\",\n", 192 | " type=\"vectorSearch\"\n", 193 | ")\n", 194 | "result = 
collection.create_search_index(model=search_index_model)\n", 195 | "\n", 196 | "# Wait for initial sync to complete\n", 197 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 198 | "predicate=None\n", 199 | "if predicate is None:\n", 200 | " predicate = lambda index: index.get(\"queryable\") is True\n", 201 | "\n", 202 | "while True:\n", 203 | " indices = list(collection.list_search_indexes(result))\n", 204 | " if len(indices) and predicate(indices[0]):\n", 205 | " break\n", 206 | " time.sleep(5)\n", 207 | "print(result + \" is ready for querying.\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# Generate embedding for the search query\n", 217 | "query_embedding = get_embedding(\"beach house\")\n", 218 | "\n", 219 | "# Sample vector search pipeline\n", 220 | "pipeline = [\n", 221 | " {\n", 222 | " \"$vectorSearch\": {\n", 223 | " \"index\": \"vector_index\",\n", 224 | " \"queryVector\": query_embedding,\n", 225 | " \"path\": \"embedding\",\n", 226 | " \"exact\": True,\n", 227 | " \"limit\": 5\n", 228 | " }\n", 229 | " }, \n", 230 | " {\n", 231 | " \"$project\": {\n", 232 | " \"_id\": 0, \n", 233 | " \"summary\": 1,\n", 234 | " \"score\": {\n", 235 | " \"$meta\": \"vectorSearchScore\"\n", 236 | " }\n", 237 | " }\n", 238 | " }\n", 239 | "]\n", 240 | "\n", 241 | "# Execute the search\n", 242 | "results = collection.aggregate(pipeline)\n", 243 | "\n", 244 | "# Print results\n", 245 | "for i in results:\n", 246 | " print(i)\n" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "Python 3", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.10.12" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 2 271 | } 272 | -------------------------------------------------------------------------------- /ai-integrations/haystack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Haystack Integration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Haystack Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/haystack/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade mongodb-atlas-haystack voyage-embedders-haystack pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "\n", 42 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 43 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 44 | "os.environ[\"MONGO_CONNECTION_STRING\"]= \"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from pymongo import MongoClient\n", 54 | "client = MongoClient(os.environ.get(\"MONGO_CONNECTION_STRING\"))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Create your database and collection\n", 64 | "db_name = \"haystack_db\"\n", 65 | "collection_name = \"test\"\n", 66 | "database = client[db_name]\n", 67 | "database.create_collection(collection_name)\n", 68 | "\n", 69 | "# Define collection\n", 70 | "collection = client[db_name][collection_name]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from pymongo.operations import SearchIndexModel\n", 80 | "import time\n", 81 | "\n", 82 | "# Create your index model, then create the search index\n", 83 | "search_index_model = SearchIndexModel(\n", 84 | " definition={\n", 85 | " \"fields\": [\n", 86 | " {\n", 87 | " \"type\": \"vector\",\n", 88 | " \"path\": \"embedding\",\n", 89 | " \"numDimensions\": 1024,\n", 90 | " \"similarity\": \"cosine\"\n", 91 | " }\n", 92 | " ]\n", 93 | " },\n", 94 | " name=\"vector_index\",\n", 95 | " type=\"vectorSearch\"\n", 96 | ")\n", 97 | "result = collection.create_search_index(model=search_index_model)\n", 98 | "\n", 99 | "# Wait for initial sync to complete\n", 100 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 101 | "predicate=None\n", 102 | "if predicate is None:\n", 103 | " predicate = lambda index: index.get(\"queryable\") is True\n", 104 | "\n", 105 | "while True:\n", 106 | " indices = list(collection.list_search_indexes(result))\n", 107 | " if len(indices) and predicate(indices[0]):\n", 108 | " break\n", 109 | " time.sleep(5)\n", 110 | "print(result + \" is ready for querying.\")" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore\n", 120 | "\n", 121 | "document_store = MongoDBAtlasDocumentStore(\n", 122 | " database_name=\"haystack_db\",\n", 123 | " collection_name=\"test\",\n", 124 | " vector_search_index=\"vector_index\",\n", 125 | " full_text_search_index=\"search_index\" # Declared but not used in this example\n", 126 | ")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from haystack import Pipeline, Document\n", 136 | "from haystack.document_stores.types import DuplicatePolicy\n", 137 | "from haystack.components.writers import DocumentWriter\n", 138 | "from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder\n", 139 | "\n", 140 | "# Create some example documents\n", 141 | "documents = [\n", 142 | " Document(content=\"My name is Jean and I live in Paris.\"),\n", 143 | " Document(content=\"My name is Mark and I live in Berlin.\"),\n", 144 | " Document(content=\"My name is Giorgio and I live in Rome.\"),\n", 145 | "]\n", 146 | "\n", 147 | "# Initializing a document embedder to convert text content into vectorized form.\n", 148 | "doc_embedder = VoyageDocumentEmbedder()\n", 149 | "\n", 150 | "# Setting up a document writer to handle the insertion of documents into the MongoDB collection.\n", 151 | "doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n", 152 | "\n", 153 | "# Creating a pipeline for indexing documents. 
The pipeline includes embedding and writing documents.\n", 154 | "indexing_pipe = Pipeline()\n", 155 | "indexing_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n", 156 | "indexing_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n", 157 | "\n", 158 | "# Connecting the components of the pipeline for document flow.\n", 159 | "indexing_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n", 160 | "\n", 161 | "# Running the pipeline with the list of documents to index them in MongoDB.\n", 162 | "indexing_pipe.run({\"doc_embedder\": {\"documents\": documents}})" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Basic RAG" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "from haystack.components.generators import OpenAIGenerator\n", 179 | "from haystack.components.builders.prompt_builder import PromptBuilder\n", 180 | "from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasEmbeddingRetriever\n", 181 | "\n", 182 | "# Template for generating prompts for a movie recommendation engine.\n", 183 | "prompt_template = \"\"\"\n", 184 | " You are an assistant allowed to use the following context documents.\\nDocuments:\n", 185 | " {% for doc in documents %}\n", 186 | " {{ doc.content }}\n", 187 | " {% endfor %}\n", 188 | "\n", 189 | " \\nQuery: {{query}}\n", 190 | " \\nAnswer:\n", 191 | "\"\"\"\n", 192 | "\n", 193 | "# Setting up a retrieval-augmented generation (RAG) pipeline for generating responses.\n", 194 | "rag_pipeline = Pipeline()\n", 195 | "rag_pipeline.add_component(\"text_embedder\", VoyageTextEmbedder())\n", 196 | "\n", 197 | "# Adding a component for retrieving related documents from MongoDB based on the query embedding.\n", 198 | "rag_pipeline.add_component(instance=MongoDBAtlasEmbeddingRetriever(document_store=document_store,top_k=15), name=\"retriever\")\n", 199 | "\n", 200 | "# Building prompts based on retrieved documents to be used for generating responses.\n", 201 | "rag_pipeline.add_component(\"prompt_builder\", PromptBuilder(template=prompt_template, required_variables=[\"query\", \"documents\"]))\n", 202 | "\n", 203 | "# Adding a language model generator to produce the final text output.\n", 204 | "rag_pipeline.add_component(\"llm\", OpenAIGenerator())\n", 205 | "\n", 206 | "# Connecting the components of the RAG pipeline to ensure proper data flow.\n", 207 | "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n", 208 | "rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", 209 | "rag_pipeline.connect(\"prompt_builder\", \"llm\")\n", 210 | "\n", 211 | "# Run the pipeline\n", 212 | "query = \"Where does Mark live?\"\n", 213 | "result = rag_pipeline.run(\n", 214 | " {\n", 215 | " \"text_embedder\": {\"text\": query},\n", 216 | " \"prompt_builder\": {\"query\": query},\n", 217 | " })\n", 218 | "print(result['llm']['replies'][0])" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.9.12" 239 | } 240 | }, 241 | "nbformat": 4, 
242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /create-embeddings/open-source-new-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - Open Source - New Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **new data** by using the open-source ``nomic-embed-text-v1`` model.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade sentence-transformers pymongo einops" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from sentence_transformers import SentenceTransformer\n", 54 | "\n", 55 | "# Load the embedding model\n", 56 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n", 57 | "\n", 58 | "# Define a function to generate embeddings\n", 59 | "def get_embedding(data, precision=\"float32\"):\n", 60 | " return model.encode(data, precision=precision).tolist()\n", 61 | "\n", 62 | "# Generate an embedding\n", 63 | "embedding = get_embedding(\"foo\")\n", 64 | "print(embedding)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### (Optional) Compress your embeddings\n", 72 | "\n", 73 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "from bson.binary import Binary \n", 83 | "from bson.binary import BinaryVectorDtype\n", 84 | "\n", 85 | "# Define a function to generate BSON vectors\n", 86 | "def generate_bson_vector(vector, vector_dtype):\n", 87 | " return Binary.from_vector(vector, vector_dtype)\n", 88 | "\n", 89 | "# Generate BSON vector from the sample float32 embedding\n", 90 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 91 | "\n", 92 | "# Print the converted embedding\n", 93 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## Generate Embeddings" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# Sample data\n", 110 | "texts = [\n", 111 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n", 112 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n", 113 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n", 114 | "]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Generate embeddings from the sample data\n", 124 | "embeddings = []\n", 125 | "for text in texts:\n", 126 | " embedding = get_embedding(text)\n", 127 | "\n", 128 | " # Uncomment the following line to convert to BSON vectors\n", 129 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 130 | " \n", 131 | " embeddings.append(embedding)\n", 132 | "\n", 133 | " # Print the embeddings\n", 134 | " print(f\"\\nText: {text}\")\n", 135 | " print(f\"Embedding: {embedding[:3]}... 
(truncated)\")" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Ingest Embeddings into MongoDB" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "def create_docs_with_embeddings(embeddings, data):\n", 152 | " docs = []\n", 153 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n", 154 | " doc = {\n", 155 | " \"_id\": i,\n", 156 | " \"text\": text,\n", 157 | " \"embedding\": embedding,\n", 158 | " }\n", 159 | " docs.append(doc)\n", 160 | " return docs" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# Create documents with embeddings and sample data\n", 170 | "docs = create_docs_with_embeddings(embeddings, texts)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "import pymongo\n", 180 | "\n", 181 | "# Connect to your MongoDB cluster\n", 182 | "mongo_client = pymongo.MongoClient(\"\")\n", 183 | "db = mongo_client[\"sample_db\"]\n", 184 | "collection = db[\"embeddings\"]\n", 185 | "\n", 186 | "# Ingest data into MongoDB\n", 187 | "collection.insert_many(docs)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "## Index and Query Your Embeddings" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from pymongo.operations import SearchIndexModel\n", 204 | "import time\n", 205 | "\n", 206 | "# Create your index model, then create the search index\n", 207 | "search_index_model = SearchIndexModel(\n", 208 | " definition = {\n", 209 | " \"fields\": [\n", 210 | " {\n", 211 | " \"type\": \"vector\",\n", 212 | " \"path\": \"embedding\",\n", 213 | " \"similarity\": \"dotProduct\",\n", 214 | " \"numDimensions\": 768\n", 215 | " }\n", 216 | " ]\n", 217 | " },\n", 218 | " name=\"vector_index\",\n", 219 | " type=\"vectorSearch\"\n", 220 | ")\n", 221 | "result = collection.create_search_index(model=search_index_model)\n", 222 | "\n", 223 | "# Wait for initial sync to complete\n", 224 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 225 | "predicate=None\n", 226 | "if predicate is None:\n", 227 | " predicate = lambda index: index.get(\"queryable\") is True\n", 228 | "\n", 229 | "while True:\n", 230 | " indices = list(collection.list_search_indexes(result))\n", 231 | " if len(indices) and predicate(indices[0]):\n", 232 | " break\n", 233 | " time.sleep(5)\n", 234 | "print(result + \" is ready for querying.\")" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# Generate embedding for the search query\n", 244 | "query_embedding = get_embedding(\"ocean tragedy\")\n", 245 | "\n", 246 | "# Sample vector search pipeline\n", 247 | "pipeline = [\n", 248 | " {\n", 249 | " \"$vectorSearch\": {\n", 250 | " \"index\": \"vector_index\",\n", 251 | " \"queryVector\": query_embedding,\n", 252 | " \"path\": \"embedding\",\n", 253 | " \"exact\": True,\n", 254 | " \"limit\": 5\n", 255 | " }\n", 256 | " }, \n", 257 | " {\n", 258 | " \"$project\": {\n", 259 | " \"_id\": 0, \n", 260 | " \"text\": 1,\n", 261 | " \"score\": {\n", 262 | " \"$meta\": \"vectorSearchScore\"\n", 263 | " }\n", 264 | " }\n", 265 | " }\n", 266 | "]\n", 267 | "\n", 268 | "# Execute the search\n", 269 | "results = collection.aggregate(pipeline)\n", 270 | "\n", 271 | "# Print results\n", 272 | "for i in results:\n", 273 | " print(i)\n" 274 | ] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Python 3", 280 | "language": "python", 281 | "name": "python3" 282 | }, 283 | "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.10.12" 294 | } 295 | }, 296 | "nbformat": 4, 297 | "nbformat_minor": 2 298 | } 299 | -------------------------------------------------------------------------------- /create-embeddings/voyage-new-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - Voyage AI - New Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **new data** by using the ``voyage-3-large`` model from Voyage AI.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade voyageai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os\n", 54 | "import voyageai\n", 55 | "\n", 56 | "# Specify your Voyage API key and embedding model\n", 57 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 58 | "model = \"voyage-3-large\"\n", 59 | "vo = voyageai.Client()\n", 60 | "\n", 61 | "# Define a function to generate embeddings\n", 62 | "def get_embedding(data, input_type = \"document\"):\n", 63 | " embeddings = vo.embed(\n", 64 | " data, model = model, input_type = input_type\n", 65 | " ).embeddings\n", 66 | " return embeddings[0]\n", 67 | "\n", 68 | "# Generate an embedding\n", 69 | "embedding = get_embedding(\"foo\")\n", 70 | "print(embedding)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### (Optional) Compress your embeddings\n", 78 | "\n", 79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
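As an alternative to converting float embeddings yourself, Voyage AI models such as `voyage-3-large` can also return quantized embeddings directly through the `output_dtype` parameter, which the quantization notebooks in this repository use. The following is a short sketch that assumes the `vo` client and `model` defined above:

```python
from bson.binary import Binary, BinaryVectorDtype

# Request int8 embeddings directly from Voyage AI
int8_embedding = vo.embed(
    ["foo"], model=model, input_type="document", output_dtype="int8"
).embeddings[0]

# Store the int8 values as a BSON INT8 vector
bson_int8_embedding = Binary.from_vector(int8_embedding, BinaryVectorDtype.INT8)
print(bson_int8_embedding)
```

A vector index over a field stored this way still uses `numDimensions: 1024`; only the element type changes.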
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from bson.binary import Binary \n", 89 | "from bson.binary import BinaryVectorDtype\n", 90 | "\n", 91 | "# Define a function to generate BSON vectors\n", 92 | "def generate_bson_vector(vector, vector_dtype):\n", 93 | " return Binary.from_vector(vector, vector_dtype)\n", 94 | "\n", 95 | "# Generate BSON vectors using the `BinaryVectorDtype` class\n", 96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 97 | "\n", 98 | "# Print the converted embedding\n", 99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Generate Embeddings" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# Sample data\n", 116 | "texts = [\n", 117 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n", 118 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n", 119 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n", 120 | "]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Generate embeddings from the sample data\n", 130 | "embeddings = []\n", 131 | "for text in texts:\n", 132 | " embedding = get_embedding(text)\n", 133 | "\n", 134 | " # Uncomment the following line to convert to BSON vectors\n", 135 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 136 | " \n", 137 | " embeddings.append(embedding)\n", 138 | "\n", 139 | " # Print the embeddings\n", 140 | " print(f\"\\nText: {text}\")\n", 141 | " print(f\"Embedding: {embedding[:3]}... 
(truncated)\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Ingest Embeddings into MongoDB" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def create_docs_with_embeddings(embeddings, data):\n", 158 | " docs = []\n", 159 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n", 160 | " doc = {\n", 161 | " \"_id\": i,\n", 162 | " \"text\": text,\n", 163 | " \"embedding\": embedding,\n", 164 | " }\n", 165 | " docs.append(doc)\n", 166 | " return docs" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Create documents with embeddings and sample data\n", 176 | "docs = create_docs_with_embeddings(embeddings, texts)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "import pymongo\n", 186 | "\n", 187 | "# Connect to your MongoDB cluster\n", 188 | "mongo_client = pymongo.MongoClient(\"\")\n", 189 | "db = mongo_client[\"sample_db\"]\n", 190 | "collection = db[\"embeddings\"]\n", 191 | "\n", 192 | "# Ingest data into MongoDB\n", 193 | "collection.insert_many(docs)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Index and Query Your Embeddings" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from pymongo.operations import SearchIndexModel\n", 210 | "import time\n", 211 | "\n", 212 | "# Create your index model, then create the search index\n", 213 | "search_index_model = SearchIndexModel(\n", 214 | " definition = {\n", 215 | " \"fields\": [\n", 216 | " {\n", 217 | " \"type\": \"vector\",\n", 218 | " \"path\": \"embedding\",\n", 219 | " \"similarity\": \"dotProduct\",\n", 220 | " \"numDimensions\": 1024\n", 221 | " }\n", 222 | " ]\n", 223 | " },\n", 224 | " name=\"vector_index\",\n", 225 | " type=\"vectorSearch\"\n", 226 | ")\n", 227 | "result = collection.create_search_index(model=search_index_model)\n", 228 | "\n", 229 | "# Wait for initial sync to complete\n", 230 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 231 | "predicate=None\n", 232 | "if predicate is None:\n", 233 | " predicate = lambda index: index.get(\"queryable\") is True\n", 234 | "\n", 235 | "while True:\n", 236 | " indices = list(collection.list_search_indexes(result))\n", 237 | " if len(indices) and predicate(indices[0]):\n", 238 | " break\n", 239 | " time.sleep(5)\n", 240 | "print(result + \" is ready for querying.\")" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Generate embedding for the search query\n", 250 | "query_embedding = get_embedding(\"ocean tragedy\", input_type=\"query\")\n", 251 | "\n", 252 | "# Sample vector search pipeline\n", 253 | "pipeline = [\n", 254 | " {\n", 255 | " \"$vectorSearch\": {\n", 256 | " \"index\": \"vector_index\",\n", 257 | " \"queryVector\": query_embedding,\n", 258 | " \"path\": \"embedding\",\n", 259 | " \"exact\": True,\n", 260 | " \"limit\": 5\n", 261 | " }\n", 262 | " }, \n", 263 | " {\n", 264 | " \"$project\": {\n", 265 | " \"_id\": 0, \n", 266 | " \"text\": 1,\n", 267 | " \"score\": {\n", 268 | " \"$meta\": \"vectorSearchScore\"\n", 269 | " }\n", 270 | " }\n", 271 | " }\n", 272 | "]\n", 273 | "\n", 274 | "# Execute the search\n", 275 | "results = collection.aggregate(pipeline)\n", 276 | "\n", 277 | "# Print results\n", 278 | "for i in results:\n", 279 | " print(i)\n" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.10.12" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 2 304 | } 305 | -------------------------------------------------------------------------------- /create-embeddings/openai-new-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Create Embeddings - OpenAI - New Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to generate embeddings from **new data** by using OpenAI's ``text-embedding-3-small`` model.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade openai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "shellscript" 41 | } 42 | }, 43 | "source": [ 44 | "## Use an Embedding Model" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os\n", 54 | "from openai import OpenAI\n", 55 | "\n", 56 | "# Specify your OpenAI API key and embedding model\n", 57 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 58 | "model = \"text-embedding-3-small\"\n", 59 | "openai_client = OpenAI()\n", 60 | "\n", 61 | "# Define a function to generate embeddings\n", 62 | "def get_embedding(text):\n", 63 | " \"\"\"Generates vector embeddings for the given text.\"\"\"\n", 64 | "\n", 65 | " embedding = openai_client.embeddings.create(input = [text], model=model).data[0].embedding\n", 66 | " return embedding\n", 67 | "\n", 68 | "# Generate an embedding\n", 69 | "embedding = get_embedding(\"foo\")\n", 70 | "print(embedding)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### (Optional) Compress your embeddings\n", 78 | "\n", 79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)." 
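Another way to reduce storage, independent of the BSON conversion, is to request fewer dimensions from the model itself; OpenAI's `text-embedding-3` models accept a `dimensions` parameter. A short sketch using the `openai_client` defined above (256 is just an example value):

```python
# Request lower-dimensional embeddings directly from OpenAI
shorter_embedding = openai_client.embeddings.create(
    input=["foo"],
    model="text-embedding-3-small",
    dimensions=256,  # example value; must match numDimensions in your vector index
).data[0].embedding

print(len(shorter_embedding))  # 256
```

If you use this option, define your vector index with the same number of dimensions.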
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from bson.binary import Binary \n", 89 | "from bson.binary import BinaryVectorDtype\n", 90 | "\n", 91 | "# Define a function to generate BSON vectors\n", 92 | "def generate_bson_vector(vector, vector_dtype):\n", 93 | " return Binary.from_vector(vector, vector_dtype)\n", 94 | "\n", 95 | "# Generate BSON vectors using the `BinaryVectorDtype` class\n", 96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 97 | "\n", 98 | "# Print the converted embedding\n", 99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Generate Embeddings" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# Sample data\n", 116 | "texts = [\n", 117 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n", 118 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n", 119 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n", 120 | "]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Generate embeddings from the sample data\n", 130 | "embeddings = []\n", 131 | "for text in texts:\n", 132 | " embedding = get_embedding(text)\n", 133 | "\n", 134 | " # Uncomment the following line to convert to BSON vectors\n", 135 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n", 136 | " \n", 137 | " embeddings.append(embedding)\n", 138 | "\n", 139 | " # Print the embeddings\n", 140 | " print(f\"\\nText: {text}\")\n", 141 | " print(f\"Embedding: {embedding[:3]}... 
(truncated)\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Ingest Embeddings into MongoDB" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def create_docs_with_embeddings(embeddings, data):\n", 158 | " docs = []\n", 159 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n", 160 | " doc = {\n", 161 | " \"_id\": i,\n", 162 | " \"text\": text,\n", 163 | " \"embedding\": embedding,\n", 164 | " }\n", 165 | " docs.append(doc)\n", 166 | " return docs" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Create documents with embeddings and sample data\n", 176 | "docs = create_docs_with_embeddings(embeddings, texts)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "import pymongo\n", 186 | "\n", 187 | "# Connect to your MongoDB cluster\n", 188 | "mongo_client = pymongo.MongoClient(\"\")\n", 189 | "db = mongo_client[\"sample_db\"]\n", 190 | "collection = db[\"embeddings\"]\n", 191 | "\n", 192 | "# Ingest data into MongoDB\n", 193 | "collection.insert_many(docs)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Index and Query Your Embeddings" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from pymongo.operations import SearchIndexModel\n", 210 | "import time\n", 211 | "\n", 212 | "# Create your index model, then create the search index\n", 213 | "search_index_model = SearchIndexModel(\n", 214 | " definition = {\n", 215 | " \"fields\": [\n", 216 | " {\n", 217 | " \"type\": \"vector\",\n", 218 | " \"path\": \"embedding\",\n", 219 | " \"similarity\": \"dotProduct\",\n", 220 | " \"numDimensions\": 1536\n", 221 | " }\n", 222 | " ]\n", 223 | " },\n", 224 | " name=\"vector_index\",\n", 225 | " type=\"vectorSearch\"\n", 226 | ")\n", 227 | "result = collection.create_search_index(model=search_index_model)\n", 228 | "\n", 229 | "# Wait for initial sync to complete\n", 230 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 231 | "predicate=None\n", 232 | "if predicate is None:\n", 233 | " predicate = lambda index: index.get(\"queryable\") is True\n", 234 | "\n", 235 | "while True:\n", 236 | " indices = list(collection.list_search_indexes(result))\n", 237 | " if len(indices) and predicate(indices[0]):\n", 238 | " break\n", 239 | " time.sleep(5)\n", 240 | "print(result + \" is ready for querying.\")" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Generate embedding for the search query\n", 250 | "query_embedding = get_embedding(\"ocean tragedy\")\n", 251 | "\n", 252 | "# Sample vector search pipeline\n", 253 | "pipeline = [\n", 254 | " {\n", 255 | " \"$vectorSearch\": {\n", 256 | " \"index\": \"vector_index\",\n", 257 | " \"queryVector\": query_embedding,\n", 258 | " \"path\": \"embedding\",\n", 259 | " \"exact\": True,\n", 260 | " \"limit\": 5\n", 261 | " }\n", 262 | " }, \n", 263 | " {\n", 264 | " \"$project\": {\n", 265 | " \"_id\": 0, \n", 266 | " \"text\": 1,\n", 267 | " \"score\": {\n", 268 | " \"$meta\": \"vectorSearchScore\"\n", 269 | " }\n", 270 | " }\n", 271 | " }\n", 272 | "]\n", 273 | "\n", 274 | "# Execute the search\n", 275 | "results = collection.aggregate(pipeline)\n", 276 | "\n", 277 | "# Print results\n", 278 | "for i in results:\n", 279 | " print(i)\n" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.10.12" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 2 304 | } 305 | -------------------------------------------------------------------------------- /ai-integrations/langchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LangChain MongoDB Integration - Implement RAG" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [LangChain Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/get-started/) tutorial. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pymongo pypdf" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "\n", 42 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 44 | "MONGODB_URI = \"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from langchain_community.document_loaders import PyPDFLoader\n", 54 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 55 | "\n", 56 | "# Load the PDF\n", 57 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n", 58 | "data = loader.load()\n", 59 | "\n", 60 | "# Split PDF into documents\n", 61 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n", 62 | "docs = text_splitter.split_documents(data)\n", 63 | "\n", 64 | "# Print the first document\n", 65 | "docs[0]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n", 75 | "from langchain_voyageai import VoyageAIEmbeddings\n", 76 | "\n", 77 | "# Instantiate the vector store using your MongoDB connection string\n", 78 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n", 79 | " connection_string = MONGODB_URI,\n", 80 | " namespace = \"langchain_db.test\",\n", 81 | " embedding = VoyageAIEmbeddings(model=\"voyage-3-large\"),\n", 82 | " index_name = \"vector_index\"\n", 83 | ")\n", 84 | "\n", 85 | "# Add documents to the vector store\n", 86 | "vector_store.add_documents(documents=docs)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# Use helper method to create the vector search index\n", 96 | "vector_store.create_vector_search_index(\n", 97 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n", 98 | " filters = [ \"page_label\" ],\n", 99 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", 100 | ")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Semantic Search Query" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "import pprint\n", 117 | "\n", 118 | "query = \"MongoDB acquisition\"\n", 119 | "results = vector_store.similarity_search(query)\n", 120 | "\n", 121 | "pprint.pprint(results)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Semantic Search with Score" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "query = \"MongoDB acquisition\"\n", 138 | "results = 
vector_store.similarity_search_with_score(\n", 139 | " query = query, k = 3\n", 140 | ")\n", 141 | "\n", 142 | "pprint.pprint(results)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Semantic Search with Filtering" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "query = \"MongoDB acquisition\"\n", 159 | "\n", 160 | "results = vector_store.similarity_search_with_score(\n", 161 | " query = query,\n", 162 | " k = 3,\n", 163 | " pre_filter = { \"page_label\": { \"$eq\": 2 } }\n", 164 | ")\n", 165 | "\n", 166 | "pprint.pprint(results)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Basic RAG" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "from langchain_core.output_parsers import StrOutputParser\n", 183 | "from langchain_core.runnables import RunnablePassthrough\n", 184 | "from langchain_openai import ChatOpenAI\n", 185 | "from langchain.prompts import PromptTemplate\n", 186 | "\n", 187 | "# Instantiate MongoDB Vector Search as a retriever\n", 188 | "retriever = vector_store.as_retriever(\n", 189 | " search_type = \"similarity\",\n", 190 | " search_kwargs = { \"k\": 10 }\n", 191 | ")\n", 192 | "\n", 193 | "# Define a prompt template\n", 194 | "template = \"\"\"\n", 195 | "\n", 196 | "Use the following pieces of context to answer the question at the end.\n", 197 | "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n", 198 | "\n", 199 | "{context}\n", 200 | "\n", 201 | "Question: {question}\n", 202 | "\"\"\"\n", 203 | "custom_rag_prompt = PromptTemplate.from_template(template)\n", 204 | "\n", 205 | "llm = ChatOpenAI(model=\"gpt-4o\")\n", 206 | "\n", 207 | "def format_docs(docs):\n", 208 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", 209 | "\n", 210 | "# Construct a chain to answer questions on your data\n", 211 | "rag_chain = (\n", 212 | " { \"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", 213 | " | custom_rag_prompt\n", 214 | " | llm\n", 215 | " | StrOutputParser()\n", 216 | ")\n", 217 | "\n", 218 | "# Prompt the chain\n", 219 | "question = \"What was MongoDB's latest acquisition?\"\n", 220 | "answer = rag_chain.invoke(question)\n", 221 | "\n", 222 | "print(\"Question: \" + question)\n", 223 | "print(\"Answer: \" + answer)\n", 224 | "\n", 225 | "# Return source documents\n", 226 | "documents = retriever.invoke(question)\n", 227 | "print(\"\\nSource documents:\")\n", 228 | "pprint.pprint(documents)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## RAG with Filters" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "# Instantiate MongoDB Vector Search as a retriever\n", 245 | "retriever = vector_store.as_retriever(\n", 246 | " search_type = \"similarity\",\n", 247 | " search_kwargs = {\n", 248 | " \"k\": 10,\n", 249 | " \"score_threshold\": 0.75,\n", 250 | " \"pre_filter\": { \"page_label\": { \"$eq\": 2 } }\n", 251 | " }\n", 252 | ")\n", 253 | "\n", 254 | "# Define a prompt template\n", 255 | "template = \"\"\"\n", 256 | "\n", 257 | "Use the following pieces of context to answer the question at the end.\n", 258 | "If you don't know the 
answer, just say that you don't know, don't try to make up an answer.\n", 259 | "\n", 260 | "{context}\n", 261 | "\n", 262 | "Question: {question}\n", 263 | "\"\"\"\n", 264 | "custom_rag_prompt = PromptTemplate.from_template(template)\n", 265 | "\n", 266 | "llm = ChatOpenAI(model=\"gpt-4o\")\n", 267 | "\n", 268 | "def format_docs(docs):\n", 269 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", 270 | "\n", 271 | "# Construct a chain to answer questions on your data\n", 272 | "rag_chain = (\n", 273 | " { \"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", 274 | " | custom_rag_prompt\n", 275 | " | llm\n", 276 | " | StrOutputParser()\n", 277 | ")\n", 278 | "\n", 279 | "# Prompt the chain\n", 280 | "question = \"What was MongoDB's latest acquisition?\"\n", 281 | "answer = rag_chain.invoke(question)\n", 282 | "\n", 283 | "print(\"Question: \" + question)\n", 284 | "print(\"Answer: \" + answer)\n", 285 | "\n", 286 | "# Return source documents\n", 287 | "documents = retriever.invoke(question)\n", 288 | "print(\"\\nSource documents:\")\n", 289 | "pprint.pprint(documents)" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.9.12" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /ai-integrations/llamaindex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - LlamaIndex Integration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [LlamaIndex Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/llamaindex/) page. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "\n", 17 | " \"Open\n", 18 | "" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "vscode": { 26 | "languageId": "shellscript" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "pip install --quiet --upgrade llama-index llama-index-vector-stores-mongodb llama-index-llms-openai llama-index-embeddings-voyageai pymongo" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "\n", 42 | "os.environ[\"VOYAGEAI_API_KEY\"] = \"\"\n", 43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 44 | "MONGODB_URI = \"\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from llama_index.embeddings.voyageai import VoyageEmbedding\n", 54 | "from llama_index.llms.openai import OpenAI\n", 55 | "from llama_index.core.settings import Settings\n", 56 | "\n", 57 | "embed_model= VoyageEmbedding(\n", 58 | " voyage_api_key = os.environ[\"VOYAGEAI_API_KEY\"],\n", 59 | " model_name = \"voyage-3-large\",\n", 60 | ")\n", 61 | "\n", 62 | "Settings.llm = OpenAI()\n", 63 | "Settings.embed_model = embed_model\n", 64 | "Settings.chunk_size = 100\n", 65 | "Settings.chunk_overlap = 10" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from llama_index.core import SimpleDirectoryReader\n", 75 | "\n", 76 | "# Load the sample data\n", 77 | "from urllib.request import urlretrieve\n", 78 | "urlretrieve(\"https://investors.mongodb.com/node/13176/pdf\", \"mongodb-earnings-report.pdf\")\n", 79 | "sample_data = SimpleDirectoryReader(input_files=[\"mongodb-earnings-report.pdf\"]).load_data()\n", 80 | "\n", 81 | "# Print the first document\n", 82 | "sample_data[0]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "import pymongo\n", 92 | "from llama_index.core import StorageContext\n", 93 | "from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch\n", 94 | "\n", 95 | "# Connect to your MongoDB cluster\n", 96 | "mongo_client = pymongo.MongoClient(MONGODB_URI)\n", 97 | "\n", 98 | "# Instantiate the vector store\n", 99 | "atlas_vector_store = MongoDBAtlasVectorSearch(\n", 100 | " mongo_client,\n", 101 | " db_name = \"llamaindex_db\",\n", 102 | " collection_name = \"test\",\n", 103 | " vector_index_name = \"vector_index\"\n", 104 | ")\n", 105 | "vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_store)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from llama_index.core import VectorStoreIndex\n", 115 | "\n", 116 | "# Store the data as vector embeddings\n", 117 | "vector_store_index = VectorStoreIndex.from_documents(\n", 118 | " sample_data, storage_context=vector_store_context, show_progress=True\n", 119 | ")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from pymongo.operations import SearchIndexModel\n", 129 | "import time\n", 130 | "\n", 131 | "# Specify the collection for which to create the index\n", 132 | "collection = mongo_client[\"llamaindex_db\"][\"test\"]\n", 133 | "\n", 
134 | "# Create your index model, then create the search index\n", 135 | "search_index_model = SearchIndexModel(\n", 136 | " definition={\n", 137 | " \"fields\": [\n", 138 | " {\n", 139 | " \"type\": \"vector\",\n", 140 | " \"path\": \"embedding\",\n", 141 | " \"numDimensions\": 1024,\n", 142 | " \"similarity\": \"cosine\"\n", 143 | " },\n", 144 | " {\n", 145 | " \"type\": \"filter\",\n", 146 | " \"path\": \"metadata.page_label\"\n", 147 | " }\n", 148 | " ]\n", 149 | " },\n", 150 | " name=\"vector_index\",\n", 151 | " type=\"vectorSearch\",\n", 152 | ")\n", 153 | "result = collection.create_search_index(model=search_index_model)\n", 154 | "\n", 155 | "# Wait for initial sync to complete\n", 156 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 157 | "predicate=None\n", 158 | "if predicate is None:\n", 159 | " predicate = lambda index: index.get(\"queryable\") is True\n", 160 | "\n", 161 | "while True:\n", 162 | " indices = list(collection.list_search_indexes(result))\n", 163 | " if len(indices) and predicate(indices[0]):\n", 164 | " break\n", 165 | " time.sleep(5)\n", 166 | "print(result + \" is ready for querying.\")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Semantic Search Query" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "retriever = vector_store_index.as_retriever(similarity_top_k=3)\n", 183 | "nodes = retriever.retrieve(\"MongoDB acquisition\")\n", 184 | "\n", 185 | "for node in nodes:\n", 186 | " print(node)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Semantic Search with Filtering" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator\n", 203 | "\n", 204 | "# Specify metadata filters\n", 205 | "metadata_filters = MetadataFilters(\n", 206 | " filters=[ExactMatchFilter(key=\"metadata.page_label\", value=\"2\")]\n", 207 | ")\n", 208 | "retriever = vector_store_index.as_retriever(similarity_top_k=3, filters=metadata_filters)\n", 209 | "nodes = retriever.retrieve(\"MongoDB acquisition\")\n", 210 | "\n", 211 | "for node in nodes:\n", 212 | " print(node)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Basic RAG" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "from llama_index.core.retrievers import VectorIndexRetriever\n", 229 | "from llama_index.core.query_engine import RetrieverQueryEngine\n", 230 | "import pprint\n", 231 | "\n", 232 | "# Instantiate MongoDB Vector Search as a retriever\n", 233 | "vector_store_retriever = VectorIndexRetriever(index=vector_store_index, similarity_top_k=5)\n", 234 | "\n", 235 | "# Pass the retriever into the query engine\n", 236 | "query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)\n", 237 | "\n", 238 | "# Prompt the LLM\n", 239 | "response = query_engine.query(\"What was MongoDB's latest acquisition?\")\n", 240 | "\n", 241 | "print(response)\n", 242 | "print(\"\\nSource documents: \")\n", 243 | "pprint.pprint(response.source_nodes)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": 
{}, 249 | "source": [ 250 | "## RAG with Filters" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "from llama_index.core.retrievers import VectorIndexRetriever\n", 260 | "from llama_index.core.query_engine import RetrieverQueryEngine\n", 261 | "import pprint\n", 262 | "\n", 263 | "# Specify metadata filters\n", 264 | "metadata_filters = MetadataFilters(\n", 265 | " filters=[ExactMatchFilter(key=\"metadata.page_label\", value=\"2\")]\n", 266 | ")\n", 267 | "\n", 268 | "# Instantiate MongoDB Vector Search as a retriever\n", 269 | "vector_store_retriever = VectorIndexRetriever(index=vector_store_index, filters=metadata_filters, similarity_top_k=5)\n", 270 | "\n", 271 | "# Pass the retriever into the query engine\n", 272 | "query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)\n", 273 | "\n", 274 | "# Prompt the LLM\n", 275 | "response = query_engine.query(\"What was MongoDB's latest acquisition?\")\n", 276 | "\n", 277 | "print(response)\n", 278 | "print(\"\\nSource documents: \")\n", 279 | "pprint.pprint(response.source_nodes)" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "3.9.12", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.9.12" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 2 304 | } 305 | -------------------------------------------------------------------------------- /ai-integrations/langchain-graphrag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b5dcbf95-9a30-416d-afed-d5b2bf0e8651", 6 | "metadata": {}, 7 | "source": [ 8 | "# LangChain MongoDB Integration - GraphRAG\n", 9 | "\n", 10 | "This notebook is a companion to the [GraphRAG with MongoDB and LangChain](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/graph-rag/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n", 11 | "\n", 12 | "This notebook demonstrates a GraphRAG implementation using MongoDB and LangChain. Compared to vector-based RAG, which structures your data as vector embeddings, GraphRAG structures data as a knowledge graph with entities and their relationships. 
This enables relationship-aware retrieval and multi-hop reasoning.\n", 13 | "\n", 14 | "\n", 15 | " \"Open\n", 16 | "" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "23f70093-83ea-4ecc-87db-2f2f89e546d7", 23 | "metadata": { 24 | "scrolled": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "pip install --quiet --upgrade pymongo langchain_community wikipedia langchain_openai langchain_mongodb langchain-text-splitters pyvis" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "d96955f9-a370-4f45-970d-ef187ee6195c", 34 | "metadata": {}, 35 | "source": [ 36 | "## Set up your environment\n", 37 | "\n", 38 | "Before you begin, make sure you have the following:\n", 39 | "\n", 40 | "- A MongoDB cluster up and running (you'll need the [connection string](https://www.mongodb.com/docs/manual/reference/connection-string/))\n", 41 | "- An API key to access an LLM (This tutorial uses a model from OpenAI, but you can use any model [supported by LangChain](https://python.langchain.com/docs/integrations/chat/))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "0119b58d-f14e-4f36-a284-345d94478537", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "\n", 53 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 54 | "MONGODB_URI = \"\"\n", 55 | "DB_NAME = \"langchain_db\" # MongoDB database to store the knowledge graph\n", 56 | "COLLECTION = \"wikipedia\" # MongoDB collection to store the knowledge graph" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "0adf66a8", 62 | "metadata": {}, 63 | "source": [ 64 | "## Use MongoDB as a knowledge graph\n", 65 | "\n", 66 | "Use the `MongoDBGraphStore` component to store your data as a knowledge graph. This component allows you to implement GraphRAG by storing entities (nodes) and their relationships (edges) in a MongoDB collection. It stores each entity as a document with relationship fields that reference other documents in your collection." 
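To make the storage model concrete, the following sketch shows the kind of entity document you can expect in the collection. The field names mirror what the visualization code below reads (`type`, `attributes`, and `relationships` with `target_ids`, `types`, and `attributes`); the values are invented for illustration:

```python
# Illustrative entity document (invented values); MongoDBGraphStore
# stores one document per extracted entity in the configured collection.
example_entity = {
    "_id": "Sherlock Holmes",                  # the entity name acts as the document id
    "type": "Person",
    "attributes": {"occupation": ["Detective"]},
    "relationships": {
        "target_ids": ["Arthur Conan Doyle"],  # _id values of related entity documents
        "types": ["CREATED_BY"],
        "attributes": [{}],
    },
}
```

Relationship-aware queries then follow these `target_ids` references from one entity document to another.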
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "f4e8db2f-d918-41aa-92f8-41f80a6d747a", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from langchain_openai import OpenAI\n", 77 | "from langchain.chat_models import init_chat_model\n", 78 | "\n", 79 | "# For best results, use latest models such as gpt-4o and Claude Sonnet 3.5+, etc.\n", 80 | "chat_model = init_chat_model(\"gpt-4o\", model_provider=\"openai\", temperature=0)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "72cd5c08-e17b-4f47-bca7-ded0fb25fb85", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from langchain_community.document_loaders import WikipediaLoader\n", 91 | "from langchain_text_splitters import TokenTextSplitter\n", 92 | "\n", 93 | "# Load Wikipedia pages corresponding to the query \"Sherlock Holmes\"\n", 94 | "wikipedia_pages = WikipediaLoader(query=\"Sherlock Holmes\", load_max_docs=3).load()\n", 95 | "\n", 96 | "# Split the documents into chunks for efficient downstream processing (graph creation)\n", 97 | "text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=0)\n", 98 | "wikipedia_docs = text_splitter.split_documents(wikipedia_pages)\n", 99 | "\n", 100 | "# Print the first document\n", 101 | "wikipedia_docs[0]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "2dc8f05b-0f9a-4293-b9ea-761030c98dca", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from langchain_mongodb.graphrag.graph import MongoDBGraphStore\n", 112 | "\n", 113 | "graph_store = MongoDBGraphStore(\n", 114 | " connection_string = MONGODB_URI,\n", 115 | " database_name = DB_NAME,\n", 116 | " collection_name = COLLECTION,\n", 117 | " entity_extraction_model = chat_model\n", 118 | ")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "3664189e", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Extract entities and create knowledge graph in MongoDB\n", 129 | "# This might take a few minutes; you can ignore any warnings\n", 130 | "graph_store.add_documents(wikipedia_docs)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "b167c2eb-b2c5-45ef-bdc9-8230f7da4c52", 136 | "metadata": {}, 137 | "source": [ 138 | "## Visualize the knowledge graph\n", 139 | "\n", 140 | "To visualize the knowledge graph, you can export the structured data to a visualization library like `pyvis`.\n", 141 | "This helps you to explore and understand the relationships and hierarchies within your data." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "8b515723-a8a4-435b-b386-5cb3244c2745", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "import networkx as nx\n", 152 | "from pyvis.network import Network\n", 153 | "\n", 154 | "def visualize_graph(collection):\n", 155 | " docs = list(collection.find())\n", 156 | " \n", 157 | " def format_attributes(attrs):\n", 158 | " return \"
\".join(f\"{k}: {', '.join(v)}\" for k, v in attrs.items()) if attrs else \"\"\n", 159 | " \n", 160 | " G = nx.DiGraph()\n", 161 | "\n", 162 | " # Create nodes\n", 163 | " for doc in docs:\n", 164 | " node_id = str(doc[\"_id\"])\n", 165 | " info = f\"Type: {doc.get('type', '')}\"\n", 166 | " if \"attributes\" in doc:\n", 167 | " attr_info = format_attributes(doc[\"attributes\"])\n", 168 | " if attr_info:\n", 169 | " info += \"
\" + attr_info\n", 170 | " G.add_node(node_id, label=node_id, title=info.replace(\"
\", \"\\n\"))\n", 171 | "\n", 172 | " # Create edges\n", 173 | " for doc in docs:\n", 174 | " source = str(doc[\"_id\"])\n", 175 | " rels = doc.get(\"relationships\", {})\n", 176 | " targets = rels.get(\"target_ids\", [])\n", 177 | " types = rels.get(\"types\", [])\n", 178 | " attrs = rels.get(\"attributes\", [])\n", 179 | " \n", 180 | " for i, target in enumerate(targets):\n", 181 | " edge_type = types[i] if i < len(types) else \"\"\n", 182 | " extra = attrs[i] if i < len(attrs) else {}\n", 183 | " edge_info = f\"Relationship: {edge_type}\"\n", 184 | " if extra:\n", 185 | " edge_info += \"
\" + format_attributes(extra)\n", 186 | " G.add_edge(source, str(target), label=edge_type, title=edge_info.replace(\"
\", \"\\n\"))\n", 187 | "\n", 188 | " # Build and configure network\n", 189 | " nt = Network(notebook=True, cdn_resources='in_line', width=\"800px\", height=\"600px\", directed=True)\n", 190 | " nt.from_nx(G)\n", 191 | " nt.set_options('''\n", 192 | " var options = {\n", 193 | " \"interaction\": {\n", 194 | " \"hover\": true,\n", 195 | " \"tooltipDelay\": 200\n", 196 | " },\n", 197 | " \"nodes\": {\n", 198 | " \"font\": {\"multi\": \"html\"}\n", 199 | " },\n", 200 | " \"physics\": {\n", 201 | " \"repulsion\": {\n", 202 | " \"nodeDistance\": 300,\n", 203 | " \"centralGravity\": 0.2,\n", 204 | " \"springLength\": 200,\n", 205 | " \"springStrength\": 0.05,\n", 206 | " \"damping\": 0.09\n", 207 | " }\n", 208 | " }\n", 209 | " }\n", 210 | " ''')\n", 211 | "\n", 212 | " return nt.generate_html()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "62f9040e", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from IPython.display import HTML, display\n", 223 | "from pymongo import MongoClient\n", 224 | "\n", 225 | "client = MongoClient(MONGODB_URI)\n", 226 | "\n", 227 | "collection = client[DB_NAME][COLLECTION]\n", 228 | "html = visualize_graph(collection)\n", 229 | "\n", 230 | "display(HTML(html))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "fbea568d-c656-4271-9e40-6ee01292255e", 236 | "metadata": {}, 237 | "source": [ 238 | "## Answer questions on your data\n", 239 | "\n", 240 | "The `MongoDBGraphStore` class provides a `chat_response` method that you can use to answer questions on your data. It executes queries by using the `$graphLookup` aggregation stage." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "506c7366-972c-4e50-88c4-3d5b0151e363", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "query = \"Who inspired Sherlock Holmes?\"\n", 251 | "\n", 252 | "answer = graph_store.chat_response(query)\n", 253 | "answer.content" 254 | ] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3 (ipykernel)", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.9.6" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 5 278 | } 279 | -------------------------------------------------------------------------------- /quantization/existing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Vector Quantization - Existing Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) tutorial. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **existing data in Atlas** by using the `voyage-3-large` model from [Voyage AI](https://www.voyageai.com).\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade voyageai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "import voyageai\n", 44 | "from bson.binary import Binary, BinaryVectorDtype\n", 45 | "\n", 46 | "# Initialize the VoyageAI Client\n", 47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 48 | "vo = voyageai.Client()\n", 49 | "\n", 50 | "# Define a function to generate embeddings for all strings in `texts`\n", 51 | "def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n", 52 | " embeddings = []\n", 53 | " for text in texts: # Process eachstring in the data list\n", 54 | " embedding = vo.embed(\n", 55 | " texts=[text], # Pass each string as a list with a single item\n", 56 | " model=model,\n", 57 | " output_dtype=dtype,\n", 58 | " output_dimension=output_dimension,\n", 59 | " ).embeddings[0]\n", 60 | " embeddings.append(embedding) # Collect the embedding for the current text\n", 61 | " return embeddings\n", 62 | "\n", 63 | "# Convert embeddings to BSON vectors\n", 64 | "def generate_bson_vector(vector, vector_dtype):\n", 65 | " return Binary.from_vector(vector, vector_dtype)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "import pymongo \n", 75 | "\n", 76 | "# Connect to your MongoDB cluster\n", 77 | "mongo_client = pymongo.MongoClient(\"\")\n", 78 | "db = mongo_client[\"sample_airbnb\"]\n", 79 | "collection = db[\"listingsAndReviews\"]\n", 80 | "\n", 81 | "# Filter to exclude null or empty summary fields\n", 82 | "filter = { \"summary\": {\"$nin\": [None, \"\"]} }\n", 83 | "\n", 84 | "# Get a subset of documents in the collection\n", 85 | "documents = collection.find(filter).limit(50)\n", 86 | "\n", 87 | "# Initialize the count of updated documents\n", 88 | "updated_doc_count = 0" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "model_name = \"voyage-3-large\"\n", 98 | "output_dimension = 1024\n", 99 | "float32_field = \"float32_embedding\"\n", 100 | "int8_field = \"int8_embedding\"\n", 101 | "int1_field = \"int1_embedding\"\n", 102 | "\n", 103 | "# Process and update each document\n", 104 | "updated_doc_count = 0 \n", 105 | "for document in documents: \n", 106 | " summary = document.get(\"summary\") \n", 107 | " if not summary: \n", 108 | " continue \n", 109 | " \n", 110 | " # Generate embeddings for the summary field \n", 111 | " float_embeddings = generate_embeddings([summary], model=model_name, dtype=\"float\", output_dimension=output_dimension) \n", 112 | " int8_embeddings = generate_embeddings([summary], model=model_name, dtype=\"int8\", output_dimension=output_dimension) \n", 113 | " ubinary_embeddings = generate_embeddings([summary], model=model_name, dtype=\"ubinary\", output_dimension=output_dimension) \n", 114 | " \n", 115 | " 
# Convert embeddings to BSON-compatible format \n", 116 | " bson_float = generate_bson_vector(float_embeddings[0], BinaryVectorDtype.FLOAT32) \n", 117 | " bson_int8 = generate_bson_vector(int8_embeddings[0], BinaryVectorDtype.INT8) \n", 118 | " bson_ubinary = generate_bson_vector(ubinary_embeddings[0], BinaryVectorDtype.PACKED_BIT) \n", 119 | " \n", 120 | " # Prepare the updated document \n", 121 | " updated_fields = { \n", 122 | " float32_field: bson_float, \n", 123 | " int8_field: bson_int8, \n", 124 | " int1_field: bson_ubinary,\n", 125 | " } \n", 126 | " \n", 127 | " # Update the document in MongoDB \n", 128 | " result = collection.update_one({\"_id\": document[\"_id\"]}, {\"$set\": updated_fields}) \n", 129 | " if result.modified_count > 0: \n", 130 | " updated_doc_count += 1 \n", 131 | " \n", 132 | "# Print the results \n", 133 | "print(f\"Number of documents updated: {updated_doc_count}\") " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from pymongo.operations import SearchIndexModel\n", 143 | "import time\n", 144 | "\n", 145 | "# Define and create the vector search index\n", 146 | "index_name = \"vector_index\"\n", 147 | "search_index_model = SearchIndexModel(\n", 148 | " definition={\n", 149 | " \"fields\": [\n", 150 | " {\n", 151 | " \"type\": \"vector\",\n", 152 | " \"path\": float32_field,\n", 153 | " \"similarity\": \"dotProduct\",\n", 154 | " \"numDimensions\": 1024\n", 155 | " },\n", 156 | " {\n", 157 | " \"type\": \"vector\",\n", 158 | " \"path\": int8_field,\n", 159 | " \"similarity\": \"dotProduct\",\n", 160 | " \"numDimensions\": 1024\n", 161 | " },\n", 162 | " {\n", 163 | " \"type\": \"vector\",\n", 164 | " \"path\": int1_field,\n", 165 | " \"similarity\": \"euclidean\",\n", 166 | " \"numDimensions\": 1024\n", 167 | " }\n", 168 | " ]\n", 169 | " },\n", 170 | " name=index_name,\n", 171 | " type=\"vectorSearch\"\n", 172 | ")\n", 173 | "result = collection.create_search_index(model=search_index_model)\n", 174 | "print(\"New search index named \" + result + \" is building.\")\n", 175 | "\n", 176 | "# Wait for initial sync to complete\n", 177 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n", 178 | "predicate=None\n", 179 | "if predicate is None:\n", 180 | " predicate = lambda index: index.get(\"queryable\") is True\n", 181 | "while True:\n", 182 | " indices = list(collection.list_search_indexes(index_name))\n", 183 | " if len(indices) and predicate(indices[0]):\n", 184 | " break\n", 185 | " time.sleep(5)\n", 186 | "print(result + \" is ready for querying.\")" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "import voyageai\n", 196 | "from bson.binary import Binary, BinaryVectorDtype\n", 197 | "\n", 198 | "# Define a function to run a vector search query\n", 199 | "def run_vector_search(query_text, collection, path):\n", 200 | " # Map path to output dtype and BSON vector type\n", 201 | " path_to_dtype = {\n", 202 | " float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n", 203 | " int8_field: (\"int8\", BinaryVectorDtype.INT8),\n", 204 | " int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n", 205 | " }\n", 206 | "\n", 207 | " if path not in path_to_dtype:\n", 208 | " raise ValueError(\"Invalid path. 
Must be one of float32_field, int8_field, int1_field.\")\n", 209 | "\n", 210 | " # Get Voyage AI output dtype and BSON vector type based on the path\n", 211 | " output_dtype, bson_dtype = path_to_dtype[path]\n", 212 | "\n", 213 | " # Generate query embeddings using Voyage AI\n", 214 | " query_vector = vo.embed(\n", 215 | " texts=[query_text],\n", 216 | " model=\"voyage-3-large\",\n", 217 | " input_type=\"query\",\n", 218 | " output_dtype=output_dtype\n", 219 | " ).embeddings[0]\n", 220 | "\n", 221 | " # Convert the query vector to BSON format\n", 222 | " bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n", 223 | "\n", 224 | " # Define the aggregation pipeline for vector search\n", 225 | " pipeline = [\n", 226 | " {\n", 227 | " \"$vectorSearch\": {\n", 228 | " \"index\": index_name, # Replace with your index name\n", 229 | " \"path\": path, # Path to the embedding field\n", 230 | " \"queryVector\": bson_query_vector, # BSON-encoded query vector\n", 231 | " \"numCandidates\": 20,\n", 232 | " \"limit\": 5\n", 233 | " }\n", 234 | " },\n", 235 | " {\n", 236 | " \"$project\": {\n", 237 | " \"_id\": 0,\n", 238 | " \"summary\": 1,\n", 239 | " \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n", 240 | " }\n", 241 | " }\n", 242 | " ]\n", 243 | "\n", 244 | " # Run the aggregation pipeline and return results\n", 245 | " return collection.aggregate(pipeline)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "from pprint import pprint\n", 255 | "\n", 256 | "# Define a list of embedding fields to query\n", 257 | "embedding_fields = [float32_field, int8_field, int1_field] \n", 258 | "results = {}\n", 259 | "\n", 260 | "# Run vector search queries for each embedding type\n", 261 | "query_text = \"ocean view\"\n", 262 | "for field in embedding_fields:\n", 263 | " results[field] = list(run_vector_search(query_text, collection, field)) \n", 264 | "\n", 265 | "# Print the results\n", 266 | "for field, field_results in results.items():\n", 267 | " print(f\"Results from {field}\")\n", 268 | " pprint(field_results)" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "Python 3", 275 | "language": "python", 276 | "name": "python3" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.9.12" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 2 293 | } 294 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /ai-integrations/langchain-memory-semantic-cache.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "922b6c28", 6 | "metadata": {}, 7 | "source": [ 8 | "# LangChain MongoDB Integration - Memory and Semantic Caching for RAG" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "1fc29d11", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook is a companion to the [Memory and Semantic Caching](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/memory-semantic-cache/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "a289ba35", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "shellscript" 30 | } 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pypdf" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "c672ba1f", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import os\n", 45 | "\n", 46 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", 47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 48 | "MONGODB_URI = \"\"" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "8384c99d", 54 | "metadata": {}, 55 | "source": [ 56 | "## Configure the Vector Store" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "f90ce770", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n", 67 | "from langchain_voyageai import VoyageAIEmbeddings\n", 68 | "\n", 69 | "# Use the voyage-3-large embedding model\n", 70 | "embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n", 71 | "\n", 72 | "# Create the vector store\n", 73 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n", 74 | " connection_string = MONGODB_URI,\n", 75 | " embedding = embedding_model,\n", 76 | " namespace = \"langchain_db.rag_with_memory\"\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "7fb2f164", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "from langchain_community.document_loaders import PyPDFLoader\n", 88 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", 89 | "\n", 90 | "# Load the PDF\n", 91 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n", 92 | "data = loader.load()\n", 93 | "\n", 94 | "# Split PDF into documents\n", 95 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n", 96 | "docs = text_splitter.split_documents(data)\n", 97 | "\n", 98 | "# Add data to the vector store\n", 99 | "vector_store.add_documents(docs)" 100 | ] 101 | }, 
102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "8bf1bff8", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# Use helper method to create the vector search index\n", 110 | "vector_store.create_vector_search_index(\n", 111 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n", 112 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n", 113 | ")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "8c3b6654", 119 | "metadata": {}, 120 | "source": [ 121 | "## Implement RAG with Memory" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "55583167", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from langchain_openai import ChatOpenAI\n", 132 | "\n", 133 | "# Define the model to use for chat completion\n", 134 | "llm = ChatOpenAI(model = \"gpt-4o\")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "3b3b0361", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from langchain_mongodb.chat_message_histories import MongoDBChatMessageHistory\n", 145 | "from langchain_core.runnables.history import RunnableWithMessageHistory\n", 146 | "from langchain_core.prompts import MessagesPlaceholder\n", 147 | " \n", 148 | "# Define a function that gets the chat message history \n", 149 | "def get_session_history(session_id: str) -> MongoDBChatMessageHistory:\n", 150 | " return MongoDBChatMessageHistory(\n", 151 | " connection_string=MONGODB_URI,\n", 152 | " session_id=session_id,\n", 153 | " database_name=\"langchain_db\",\n", 154 | " collection_name=\"rag_with_memory\"\n", 155 | " )" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "74dfa896", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from langchain_core.prompts import ChatPromptTemplate\n", 166 | "from langchain_core.output_parsers import StrOutputParser\n", 167 | "\n", 168 | "# Create a prompt to generate standalone questions from follow-up questions\n", 169 | "standalone_system_prompt = \"\"\"\n", 170 | " Given a chat history and a follow-up question, rephrase the follow-up question to be a standalone question.\n", 171 | " Do NOT answer the question, just reformulate it if needed, otherwise return it as is.\n", 172 | " Only return the final standalone question.\n", 173 | "\"\"\"\n", 174 | "\n", 175 | "standalone_question_prompt = ChatPromptTemplate.from_messages(\n", 176 | " [\n", 177 | " (\"system\", standalone_system_prompt),\n", 178 | " MessagesPlaceholder(variable_name=\"history\"),\n", 179 | " (\"human\", \"{question}\"),\n", 180 | " ]\n", 181 | ")\n", 182 | "# Parse output as a string\n", 183 | "parse_output = StrOutputParser()\n", 184 | "\n", 185 | "question_chain = standalone_question_prompt | llm | parse_output" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "c7ad7c83", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "from langchain_core.runnables import RunnablePassthrough\n", 196 | "\n", 197 | "# Create a retriever\n", 198 | "retriever = vector_store.as_retriever(search_type=\"similarity\", search_kwargs={ \"k\": 5 })\n", 199 | "\n", 200 | "# Create a retriever chain that processes the question with history and retrieves documents\n", 201 | "retriever_chain = RunnablePassthrough.assign(context=question_chain | retriever | (lambda docs: 
\"\\n\\n\".join([d.page_content for d in docs])))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "c15d460d", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Create a prompt template that includes the retrieved context and chat history\n", 212 | "rag_system_prompt = \"\"\"Answer the question based only on the following context:\n", 213 | "{context}\n", 214 | "\"\"\"\n", 215 | "\n", 216 | "rag_prompt = ChatPromptTemplate.from_messages(\n", 217 | " [\n", 218 | " (\"system\", rag_system_prompt),\n", 219 | " MessagesPlaceholder(variable_name=\"history\"),\n", 220 | " (\"human\", \"{question}\"),\n", 221 | " ]\n", 222 | ")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "4401715b", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "# Build the RAG chain\n", 233 | "rag_chain = (\n", 234 | " retriever_chain\n", 235 | " | rag_prompt\n", 236 | " | llm\n", 237 | " | parse_output\n", 238 | ")\n", 239 | "\n", 240 | "# Wrap the chain with message history\n", 241 | "rag_with_memory = RunnableWithMessageHistory(\n", 242 | " rag_chain,\n", 243 | " get_session_history,\n", 244 | " input_messages_key=\"question\",\n", 245 | " history_messages_key=\"history\",\n", 246 | ")" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "2093d8c8", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# First question\n", 257 | "response_1 = rag_with_memory.invoke(\n", 258 | " {\"question\": \"What was MongoDB's latest acquisition?\"},\n", 259 | " {\"configurable\": {\"session_id\": \"user_1\"}}\n", 260 | ")\n", 261 | "print(response_1)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "14513bb6", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# Follow-up question that references the previous question\n", 272 | "response_2 = rag_with_memory.invoke(\n", 273 | " {\"question\": \"Why did they do it?\"},\n", 274 | " {\"configurable\": {\"session_id\": \"user_1\"}}\n", 275 | ")\n", 276 | "print(response_2)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "id": "d9b2c3c5", 282 | "metadata": {}, 283 | "source": [ 284 | "## Add Semantic Caching\n", 285 | "\n", 286 | "The semantic cache caches only the input to the LLM. When using it in retrieval chains, \n", 287 | "note that documents retrieved can change between runs, resulting in cache misses for \n", 288 | "semantically similar queries." 
289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "594315fe", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "from langchain_mongodb.cache import MongoDBAtlasSemanticCache\n", 299 | "from langchain_core.globals import set_llm_cache\n", 300 | "\n", 301 | "# Configure the semantic cache\n", 302 | "set_llm_cache(MongoDBAtlasSemanticCache(\n", 303 | " connection_string = MONGODB_URI,\n", 304 | " database_name = \"langchain_db\",\n", 305 | " collection_name = \"semantic_cache\",\n", 306 | " embedding = embedding_model,\n", 307 | " index_name = \"vector_index\",\n", 308 | " similarity_threshold = 0.5 # Adjust based on your requirements\n", 309 | "))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "f8063217", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "%%time\n", 320 | "\n", 321 | "# First query (not cached)\n", 322 | "rag_with_memory.invoke(\n", 323 | " {\"question\": \"What was MongoDB's latest acquisition?\"},\n", 324 | " {\"configurable\": {\"session_id\": \"user_2\"}}\n", 325 | ")" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "df4b0318", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "%%time\n", 336 | "\n", 337 | "# Second query (cached)\n", 338 | "rag_with_memory.invoke(\n", 339 | " {\"question\": \"What company did MongoDB acquire recently?\"},\n", 340 | " {\"configurable\": {\"session_id\": \"user_2\"}}\n", 341 | ")" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.10.12" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 5 366 | } 367 | -------------------------------------------------------------------------------- /quantization/new-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MongoDB Vector Search - Vector Quantization - New Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook is a companion to the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) tutorial. 
Refer to the page for set-up instructions and detailed explanations.\n", 15 | "\n", 16 | "This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **new data** by using the `voyage-3-large` model from [Voyage AI](https://www.voyageai.com).\n", 17 | "\n", 18 | "\n", 19 | " \"Open\n", 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "vscode": { 28 | "languageId": "shellscript" 29 | } 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "pip install --quiet --upgrade voyageai pymongo" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "import voyageai\n", 44 | "from bson.binary import Binary, BinaryVectorDtype\n", 45 | "\n", 46 | "# Initialize the VoyageAI Client\n", 47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n", 48 | "vo = voyageai.Client()\n", 49 | "\n", 50 | "# Define a function to generate embeddings for all strings in `texts`\n", 51 | "def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n", 52 | "    embeddings = []\n", 53 | "    for text in texts: # Process each string in the data list\n", 54 | "        embedding = vo.embed(\n", 55 | "            texts=[text], # Pass each string as a list with a single item\n", 56 | "            model=model,\n", 57 | "            output_dtype=dtype,\n", 58 | "            output_dimension=output_dimension,\n", 59 | "        ).embeddings[0]\n", 60 | "        embeddings.append(embedding) # Collect the embedding for the current text\n", 61 | "    return embeddings\n", 62 | "\n", 63 | "# Convert embeddings to BSON vectors\n", 64 | "def generate_bson_vector(vector, vector_dtype):\n", 65 | "    return Binary.from_vector(vector, vector_dtype)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Load sample data\n", 75 | "data = [\n", 76 | "    \"The Great Wall of China is visible from space.\",\n", 77 | "    \"The Eiffel Tower was completed in Paris in 1889.\",\n", 78 | "    \"Mount Everest is the highest peak on Earth at 8,848m.\",\n", 79 | "    \"Shakespeare wrote 37 plays and 154 sonnets during his lifetime.\",\n", 80 | "    \"The Mona Lisa was painted by Leonardo da Vinci.\"\n", 81 | "]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Use the function with different output data types to generate embeddings\n", 91 | "model_name = \"voyage-3-large\"\n", 92 | "output_dimension = 1024\n", 93 | "\n", 94 | "# Generate embeddings in all supported data types\n", 95 | "float32_embeddings = generate_embeddings(data, model=model_name, dtype=\"float\", output_dimension=output_dimension)\n", 96 | "int8_embeddings = generate_embeddings(data, model=model_name, dtype=\"int8\", output_dimension=output_dimension)\n", 97 | "int1_embeddings = generate_embeddings(data, model=model_name, dtype=\"ubinary\", output_dimension=output_dimension)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# For all vectors in your collection, generate BSON vectors of float32, int8, and int1 embeddings\n", 107 | "bson_float32_embeddings = []\n", 108 | "bson_int8_embeddings = []\n", 109 | "bson_int1_embeddings = []\n", 110 | "for i, (f32_emb, int8_emb, int1_emb) in enumerate(zip(float32_embeddings, int8_embeddings, int1_embeddings)):\n", 111 | "    
bson_float32_embeddings.append(generate_bson_vector(f32_emb, BinaryVectorDtype.FLOAT32))\n", 112 | " bson_int8_embeddings.append(generate_bson_vector(int8_emb, BinaryVectorDtype.INT8))\n", 113 | " bson_int1_embeddings.append(generate_bson_vector(int1_emb, BinaryVectorDtype.PACKED_BIT))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Specify the field names for the float32, int8, and int1 embeddings\n", 123 | "float32_field = \"float32_embedding\" \n", 124 | "int8_field = \"int8_embedding\"\n", 125 | "int1_field = \"int1_embedding\"\n", 126 | "\n", 127 | "# Define function to create documents with BSON vector embeddings\n", 128 | "def create_new_docs_with_bson_vectors(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data):\n", 129 | " docs = []\n", 130 | " for i, (bson_f32_emb, bson_int8_emb, bson_int1_emb, text) in enumerate(zip(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data)):\n", 131 | "\n", 132 | " doc = {\n", 133 | " \"_id\": i,\n", 134 | " \"text\": text,\n", 135 | " float32_field: bson_f32_emb,\n", 136 | " int8_field: bson_int8_emb,\n", 137 | " int1_field: bson_int1_emb\n", 138 | " }\n", 139 | " docs.append(doc)\n", 140 | " return docs\n", 141 | "\n", 142 | "# Create the documents\n", 143 | "documents = create_new_docs_with_bson_vectors(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "import pymongo\n", 153 | "\n", 154 | "mongo_client = pymongo.MongoClient(\"\")\n", 155 | "# Insert documents into a new database and collection\n", 156 | "db = mongo_client[\"\"]\n", 157 | "collection_name = \"\"\n", 158 | "db.create_collection(collection_name)\n", 159 | "collection = db[collection_name]\n", 160 | "\n", 161 | "collection.insert_many(documents)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "from pymongo.operations import SearchIndexModel\n", 171 | "import time\n", 172 | "\n", 173 | "# Define and create the vector search index\n", 174 | "index_name = \"vector_index\"\n", 175 | "search_index_model = SearchIndexModel(\n", 176 | " definition={\n", 177 | " \"fields\": [\n", 178 | " {\n", 179 | " \"type\": \"vector\",\n", 180 | " \"path\": float32_field,\n", 181 | " \"similarity\": \"dotProduct\",\n", 182 | " \"numDimensions\": 1024\n", 183 | " },\n", 184 | " {\n", 185 | " \"type\": \"vector\",\n", 186 | " \"path\": int8_field,\n", 187 | " \"similarity\": \"dotProduct\",\n", 188 | " \"numDimensions\": 1024\n", 189 | " },\n", 190 | " {\n", 191 | " \"type\": \"vector\",\n", 192 | " \"path\": int1_field,\n", 193 | " \"similarity\": \"euclidean\",\n", 194 | " \"numDimensions\": 1024\n", 195 | " }\n", 196 | " ]\n", 197 | " },\n", 198 | " name=index_name,\n", 199 | " type=\"vectorSearch\"\n", 200 | ")\n", 201 | "result = collection.create_search_index(model=search_index_model)\n", 202 | "print(\"New search index named \" + result + \" is building.\")\n", 203 | "\n", 204 | "# Wait for initial sync to complete\n", 205 | "print(\"Polling to check if the index is ready. 
This may take up to a minute.\")\n", 206 | "predicate=None\n", 207 | "if predicate is None:\n", 208 | " predicate = lambda index: index.get(\"queryable\") is True\n", 209 | "while True:\n", 210 | " indices = list(collection.list_search_indexes(index_name))\n", 211 | " if len(indices) and predicate(indices[0]):\n", 212 | " break\n", 213 | " time.sleep(5)\n", 214 | "print(result + \" is ready for querying.\")" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "import voyageai\n", 224 | "from bson.binary import Binary, BinaryVectorDtype\n", 225 | "\n", 226 | "# Define a function to run a vector search query\n", 227 | "def run_vector_search(query_text, collection, path):\n", 228 | " # Map path to output dtype and BSON vector type\n", 229 | " path_to_dtype = {\n", 230 | " float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n", 231 | " int8_field: (\"int8\", BinaryVectorDtype.INT8),\n", 232 | " int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n", 233 | " }\n", 234 | "\n", 235 | " if path not in path_to_dtype:\n", 236 | " raise ValueError(\"Invalid path. Must be one of float32_field, int8_field, int1_field.\")\n", 237 | "\n", 238 | " # Get Voyage AI output dtype and BSON vector type based on the path\n", 239 | " output_dtype, bson_dtype = path_to_dtype[path]\n", 240 | "\n", 241 | " # Generate query embeddings using Voyage AI\n", 242 | " query_vector = vo.embed(\n", 243 | " texts=[query_text],\n", 244 | " model=\"voyage-3-large\",\n", 245 | " input_type=\"query\",\n", 246 | " output_dtype=output_dtype\n", 247 | " ).embeddings[0]\n", 248 | "\n", 249 | " # Convert the query vector to BSON format\n", 250 | " bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n", 251 | "\n", 252 | " # Define the aggregation pipeline for vector search\n", 253 | " pipeline = [\n", 254 | " {\n", 255 | " \"$vectorSearch\": {\n", 256 | " \"index\": index_name, # Replace with your index name\n", 257 | " \"path\": path, # Path to the embedding field\n", 258 | " \"queryVector\": bson_query_vector, # BSON-encoded query vector\n", 259 | " \"numCandidates\": 5,\n", 260 | " \"limit\": 2\n", 261 | " }\n", 262 | " },\n", 263 | " {\n", 264 | " \"$project\": {\n", 265 | " \"_id\": 0,\n", 266 | " \"text\": 1,\n", 267 | " \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n", 268 | " }\n", 269 | " }\n", 270 | " ]\n", 271 | "\n", 272 | " # Run the aggregation pipeline and return results\n", 273 | " return collection.aggregate(pipeline)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from pprint import pprint\n", 283 | "\n", 284 | "# Define a list of embedding fields to query\n", 285 | "embedding_fields = [float32_field, int8_field, int1_field] \n", 286 | "results = {}\n", 287 | "\n", 288 | "# Run vector search queries for each embedding type\n", 289 | "query_text = \"science fact\"\n", 290 | "for field in embedding_fields:\n", 291 | " results[field] = list(run_vector_search(query_text, collection, field)) \n", 292 | "\n", 293 | "# Print the results\n", 294 | "for field, field_results in results.items():\n", 295 | " print(f\"Results from {field}\")\n", 296 | " pprint(field_results)" 297 | ] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | 
"codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.9.12" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 2 321 | } 322 | --------------------------------------------------------------------------------