├── .github
│   ├── CODEOWNERS
│   └── workflows
│       └── devdocs-review.yml
├── create-embeddings
│   ├── README.md
│   ├── open-source-existing-data.ipynb
│   ├── voyage-existing-data.ipynb
│   ├── openai-existing-data.ipynb
│   ├── open-source-new-data.ipynb
│   ├── voyage-new-data.ipynb
│   └── openai-new-data.ipynb
├── manage-indexes
│   ├── delete-indexes.ipynb
│   ├── view-indexes.ipynb
│   ├── edit-indexes.ipynb
│   ├── create-indexes-basic.ipynb
│   └── create-indexes-filter.ipynb
├── ai-integrations
│   ├── README.md
│   ├── langchain-parent-document-retrieval.ipynb
│   ├── langchain-hybrid-search.ipynb
│   ├── langchain-natural-language.ipynb
│   ├── langchain-local-rag.ipynb
│   ├── semantic-kernel.ipynb
│   ├── haystack.ipynb
│   ├── langchain.ipynb
│   ├── llamaindex.ipynb
│   ├── langchain-graphrag.ipynb
│   └── langchain-memory-semantic-cache.ipynb
├── README.md
├── use-cases
│   ├── local-rag.ipynb
│   ├── rag.ipynb
│   └── rag-with-voyage.ipynb
├── quantization
│   ├── existing-data.ipynb
│   └── new-data.ipynb
└── LICENSE
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @mongodb/devdocs @davidhou17
2 |
3 | /.github/ @mongodb/devdocs
--------------------------------------------------------------------------------
/create-embeddings/README.md:
--------------------------------------------------------------------------------
1 | # Create Embeddings
2 |
3 | This folder contains Jupyter Notebooks that describe how to generate
4 | vector embeddings.
5 |
6 | Select one of the following notebooks based on your preferred
7 | embedding model, and whether you're generating embeddings from
8 | new data or from data you already have in MongoDB.
9 |
10 | | Notebook | Description |
11 | |----------|-------------|
12 | | [open-source-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-new-data.ipynb) | Generate embeddings from new data using an open-source embedding model |
13 | | [open-source-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/open-source-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an open-source embedding model |
14 | | [voyage-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-new-data.ipynb) | Generate embeddings from new data using an embedding model from Voyage AI |
15 | | [voyage-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/voyage-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an embedding model from Voyage AI |
16 | | [openai-new-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-new-data.ipynb) | Generate embeddings from new data using an embedding model from OpenAI |
17 | | [openai-existing-data](https://github.com/mongodb/docs-notebooks/blob/main/create-embeddings/openai-existing-data.ipynb) | Generate embeddings from existing data in MongoDB using an embedding model from OpenAI |
18 |
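19 | As a quick orientation before you open the notebooks, the following is a
20 | minimal sketch of the "new data" flow with an open-source model: embed a
21 | piece of text and store the vector alongside it in MongoDB. The connection
22 | string, namespace, and field names are placeholders, not values the
23 | notebooks require.
24 | 
25 | ```python
26 | # Requires: pip install pymongo sentence-transformers
27 | from pymongo import MongoClient
28 | from sentence_transformers import SentenceTransformer
29 | 
30 | # Example open-source embedding model (also used in the local RAG notebooks)
31 | model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
32 | 
33 | # Connect to your cluster and choose a namespace (placeholder values)
34 | client = MongoClient("<connection-string>")
35 | collection = client["sample_db"]["embedded_docs"]
36 | 
37 | # Embed a piece of new data and store the vector next to the source text
38 | text = "MongoDB Vector Search supports semantic queries."
39 | collection.insert_one({"text": text, "embedding": model.encode(text).tolist()})
40 | ```
41 | 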
--------------------------------------------------------------------------------
/.github/workflows/devdocs-review.yml:
--------------------------------------------------------------------------------
1 | name: Notify DevDocs on Reviews
2 |
3 | on:
4 | issue_comment:
5 | types: [created, edited]
6 | pull_request_review_comment:
7 | types: [created, edited]
8 | pull_request_target:
9 | types: [opened, edited]
10 | pull_request_review:
11 | types: [submitted, edited]
12 |
13 | jobs:
14 | notify:
15 | runs-on: ubuntu-latest
16 | if: ${{ github.event.pull_request || github.event.issue.pull_request }}
17 | steps:
18 | - name: Send Slack Notification
19 | env:
20 | EVENT_BODY: ${{ github.event.pull_request.body || github.event.comment.body }}
21 | PR_TITLE: ${{ github.event.pull_request.title || github.event.issue.title }}
22 | PR_USER: ${{ github.event.pull_request.user.login || github.event.comment.user.login }}
23 | PR_URL: ${{ github.event.pull_request.html_url || github.event.comment.html_url }}
24 | REPO_NAME: ${{ github.repository }}
25 | SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
26 | run: |
27 | if ! echo "$EVENT_BODY" | grep -qE "@DevDocs|@mongodb/devdocs"; then
28 | echo "::notice::No '@DevDocs' or '@mongodb/devdocs' tag found. Skipping notification."
29 | exit 0
30 | fi
31 |
32 | # 2. Build the message and payload in one step
33 | SLACK_PAYLOAD=$(jq -n \
34 | --arg repo "$REPO_NAME" \
35 | --arg title "$PR_TITLE" \
36 | --arg user "$PR_USER" \
37 | --arg url "$PR_URL" \
38 | '{
39 | "channel": "#docs-devdocs-notifications",
40 | "username": "Issue Notifier",
41 | "icon_emoji": ":mega:",
42 | "text": "*📢 @DevDocs mentioned in * \($repo)\n*Title:* \($title)\n*By:* \($user)\n*URL:* \($url)"
43 | }')
44 |
45 | # 3. Send to Slack
46 | curl -X POST \
47 | -H 'Content-type: application/json' \
48 | --data "$SLACK_PAYLOAD" \
49 | "$SLACK_WEBHOOK"
50 |
--------------------------------------------------------------------------------
/manage-indexes/delete-indexes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Delete Vector Indexes"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from pymongo.mongo_client import MongoClient\n",
41 | "\n",
42 | "# Connect to your MongoDB cluster\n",
43 | "uri = \"\"\n",
44 | "client = MongoClient(uri)\n",
45 | "\n",
46 | "# Access your database and collection\n",
47 | "database = client[\"\"]\n",
48 | "collection = database[\"\"]\n",
49 | "\n",
50 | "# Delete your search index\n",
51 | "collection.drop_search_index(\"\")"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "kernelspec": {
57 | "display_name": "Python 3 (ipykernel)",
58 | "language": "python",
59 | "name": "python3"
60 | },
61 | "language_info": {
62 | "codemirror_mode": {
63 | "name": "ipython",
64 | "version": 3
65 | },
66 | "file_extension": ".py",
67 | "mimetype": "text/x-python",
68 | "name": "python",
69 | "nbconvert_exporter": "python",
70 | "pygments_lexer": "ipython3",
71 | "version": "3.9.6"
72 | }
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 2
76 | }
77 |
--------------------------------------------------------------------------------
/manage-indexes/view-indexes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - View Vector Indexes"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from pymongo.mongo_client import MongoClient\n",
41 | "\n",
42 | "# Connect to your MongoDB cluster\n",
43 | "uri = \"\"\n",
44 | "client = MongoClient(uri)\n",
45 | "\n",
46 | "# Access your database and collection\n",
47 | "database = client[\"\"]\n",
48 | "collection = database[\"\"]\n",
49 | "\n",
50 | "# Get a list of the collection's search indexes and print them\n",
51 | "cursor = collection.list_search_indexes()\n",
52 | "for index in cursor:\n",
53 | " print(index)"
54 | ]
55 | }
56 | ],
57 | "metadata": {
58 | "kernelspec": {
59 | "display_name": "Python 3 (ipykernel)",
60 | "language": "python",
61 | "name": "python3"
62 | },
63 | "language_info": {
64 | "codemirror_mode": {
65 | "name": "ipython",
66 | "version": 3
67 | },
68 | "file_extension": ".py",
69 | "mimetype": "text/x-python",
70 | "name": "python",
71 | "nbconvert_exporter": "python",
72 | "pygments_lexer": "ipython3",
73 | "version": "3.9.6"
74 | }
75 | },
76 | "nbformat": 4,
77 | "nbformat_minor": 2
78 | }
79 |
--------------------------------------------------------------------------------
/ai-integrations/README.md:
--------------------------------------------------------------------------------
1 | # AI Integrations
2 |
3 | This folder contains Jupyter Notebooks that demonstrate how to integrate various AI frameworks with MongoDB. These notebooks show you how to implement RAG and other features for your AI-powered and agentic applications by leveraging MongoDB as both a vector database and a document database.
4 |
5 | | Notebook | Description |
6 | |----------|-------------|
7 | | [langchain](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain.ipynb) | Implement basic RAG with LangChain and MongoDB Vector Search |
8 | | [langchain-memory-semantic-cache](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-memory-semantic-cache.ipynb) | Implement RAG with memory and semantic caching using LangChain and MongoDB |
9 | | [langchain-hybrid-search](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-hybrid-search.ipynb) | Combine vector search with full-text search using LangChain and MongoDB |
10 | | [langchain-parent-document-retrieval](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-parent-document-retrieval.ipynb) | Perform parent-document retrieval with LangChain and MongoDB |
11 | | [langchain-self-query-retrieval](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-self-query-retrieval.ipynb) | Perform self-querying retrieval with LangChain and MongoDB |
12 | | [langchain-local-rag](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-local-rag.ipynb) | Implement RAG with local models using LangChain and MongoDB |
13 | | [langchain-graphrag](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-graphrag.ipynb) | Implement graph-based RAG with LangChain and MongoDB |
14 | | [langchain-natural-language](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langchain-natural-language.ipynb) | Perform natural language querying with LangChain and MongoDB |
15 | | [langgraph](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/langgraph.ipynb) | Build an AI agent with LangGraph and MongoDB |
16 | | [llamaindex](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/llamaindex.ipynb) | Implement basic RAG with LlamaIndex and MongoDB |
17 | | [haystack](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/haystack.ipynb) | Implement basic RAG with Haystack and MongoDB |
18 | | [semantic-kernel](https://github.com/mongodb/docs-notebooks/blob/main/ai-integrations/semantic-kernel.ipynb) | Implement basic RAG with Microsoft Semantic Kernel and MongoDB |
19 |
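20 | Many of the LangChain notebooks share the same starting point: instantiate a
21 | MongoDB-backed vector store, then layer retrievers, indexes, and chains on top
22 | of it. The following is a minimal sketch of that setup; the connection string,
23 | namespace, and index name are placeholders, and it assumes a Voyage AI API key
24 | is available in your environment.
25 | 
26 | ```python
27 | from langchain_mongodb import MongoDBAtlasVectorSearch
28 | from langchain_voyageai import VoyageAIEmbeddings
29 | 
30 | # Placeholder connection details -- swap in your own values
31 | # (assumes VOYAGE_API_KEY is set in your environment)
32 | vector_store = MongoDBAtlasVectorSearch.from_connection_string(
33 |     connection_string="<connection-string>",
34 |     namespace="langchain_db.test",
35 |     embedding=VoyageAIEmbeddings(model="voyage-3-large"),
36 |     index_name="vector_index",
37 | )
38 | 
39 | # The notebooks build retrieval and RAG chains on top of a store like this one
40 | retriever = vector_store.as_retriever()
41 | ```
42 | 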
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MongoDB Documentation Notebooks
2 |
3 | This repository contains Jupyter Notebooks that follow
4 | tutorials and code examples in the official [MongoDB Vector Search documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/). You can run, download, and modify these notebooks as you learn how to use MongoDB Vector Search for your use case.
5 |
6 | ## Overview
7 |
8 | Each notebook corresponds to a page or example in our documentation.
9 | Refer to the docs page linked in each notebook for prerequisites, set-up instructions, and detailed explanations of the code.
10 |
11 | The following table summarizes the contents of the notebooks in each directory:
12 |
13 | | Directory | Description |
14 | |--------------------|--------------------------------------------------|
15 | | [/create-embeddings](https://github.com/mongodb/docs-notebooks/tree/main/create-embeddings) | Learn how to generate embeddings for vector search |
16 | | [/get-started](https://github.com/mongodb/docs-notebooks/tree/main/get-started) | Complete our quick start tutorial |
17 | | [/ai-integrations](https://github.com/mongodb/docs-notebooks/tree/main/ai-integrations) | Build AI applications and agents with popular AI frameworks that integrate with MongoDB |
18 | | [/manage-indexes](https://github.com/mongodb/docs-notebooks/tree/main/manage-indexes) | Create, view, edit, and delete vector search indexes |
19 | | [/quantization](https://github.com/mongodb/docs-notebooks/tree/main/quantization) | Quantize your vector embeddings for efficient processing |
20 | | [/run-queries](https://github.com/mongodb/docs-notebooks/tree/main/run-queries) | Learn how to run vector search queries (ANN and ENN) |
21 | | [/use-cases](https://github.com/mongodb/docs-notebooks/tree/main/use-cases) | Implement RAG and build AI agents using a MongoDB-native retrieval system |
22 |
23 | ## Other Resources
24 |
25 | - [MongoDB Vector Search Documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-overview/)
26 | - [Generative AI Use Cases Repository](https://github.com/mongodb-developer/GenAI-Showcase/tree/main)
27 |
28 | ## License
29 |
30 | This project is licensed under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0).
31 |
32 | ## Issues
33 |
34 | To report an issue with any of these notebooks, please leave feedback through
35 | the corresponding documentation page linked at the top of each notebook. Using the
36 | `Rate This Page` button, you can add a comment about the issue after leaving
37 | a star rating.
38 |
39 | ## Contributing
40 |
41 | We are not currently accepting public contributions to this
42 | repository.
43 |
--------------------------------------------------------------------------------
/manage-indexes/edit-indexes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Edit Vector Indexes"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from pymongo.mongo_client import MongoClient\n",
41 | "\n",
42 | "# Connect to your MongoDB cluster\n",
43 | "uri = \"\"\n",
44 | "client = MongoClient(uri)\n",
45 | "\n",
46 | "# Access your database and collection\n",
47 | "database = client[\"\"]\n",
48 | "collection = database[\"\"]\n",
49 | "\n",
50 | "definition = {\n",
51 | " \"fields\": [\n",
52 | " {\n",
53 | " \"type\": \"vector\",\n",
54 | " \"numDimensions\": ,\n",
55 | " \"path\": \"\",\n",
56 | " \"similarity\": \"euclidean | cosine | dotProduct\",\n",
57 | " \"quantization\": \" none | scalar | binary \"\n",
58 | " },\n",
59 | " {\n",
60 | " \"type\": \"filter\",\n",
61 | " \"path\": \"\"\n",
62 | " },\n",
63 | " ...\n",
64 | " ]\n",
65 | "}\n",
66 | " \n",
67 | "# Update your search index\n",
68 | "collection.update_search_index(\"\", definition)"
69 | ]
70 | }
71 | ],
72 | "metadata": {
73 | "kernelspec": {
74 | "display_name": "Python 3 (ipykernel)",
75 | "language": "python",
76 | "name": "python3"
77 | },
78 | "language_info": {
79 | "codemirror_mode": {
80 | "name": "ipython",
81 | "version": 3
82 | },
83 | "file_extension": ".py",
84 | "mimetype": "text/x-python",
85 | "name": "python",
86 | "nbconvert_exporter": "python",
87 | "pygments_lexer": "ipython3",
88 | "version": "3.9.6"
89 | }
90 | },
91 | "nbformat": 4,
92 | "nbformat_minor": 2
93 | }
94 |
--------------------------------------------------------------------------------
/manage-indexes/create-indexes-basic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Vector Indexes - Basic Example"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion for the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from pymongo.mongo_client import MongoClient\n",
41 | "from pymongo.operations import SearchIndexModel\n",
42 | "import time\n",
43 | "\n",
44 | "# Connect to your MongoDB cluster\n",
45 | "uri = \"\"\n",
46 | "client = MongoClient(uri)\n",
47 | "\n",
48 | "# Access your database and collection\n",
49 | "database = client[\"sample_mflix\"]\n",
50 | "collection = database[\"embedded_movies\"]\n",
51 | "\n",
52 | "# Create your index model, then create the search index\n",
53 | "search_index_model = SearchIndexModel(\n",
54 | " definition={\n",
55 | " \"fields\": [\n",
56 | " {\n",
57 | " \"type\": \"vector\",\n",
58 | " \"path\": \"plot_embedding_voyage_3_large\",\n",
59 | " \"numDimensions\": 2048,\n",
60 | " \"similarity\": \"dotProduct\"\n",
61 | " }\n",
62 | " ]\n",
63 | " },\n",
64 | " name=\"vector_index\",\n",
65 | " type=\"vectorSearch\",\n",
66 | ")\n",
67 | "\n",
68 | "result = collection.create_search_index(model=search_index_model)\n",
69 | "print(\"New search index named \" + result + \" is building.\")\n",
70 | "\n",
71 | "# Wait for initial sync to complete\n",
72 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
73 | "predicate=None\n",
74 | "if predicate is None:\n",
75 | " predicate = lambda index: index.get(\"queryable\") is True\n",
76 | "\n",
77 | "while True:\n",
78 | " indices = list(collection.list_search_indexes(result))\n",
79 | " if len(indices) and predicate(indices[0]):\n",
80 | " break\n",
81 | " time.sleep(5)\n",
82 | "print(result + \" is ready for querying.\")"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3 (ipykernel)",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.9.6"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 2
107 | }
108 |
--------------------------------------------------------------------------------
/manage-indexes/create-indexes-filter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Vector Indexes - Filter Example"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create and Manage Indexes](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "from pymongo.mongo_client import MongoClient\n",
41 | "from pymongo.operations import SearchIndexModel\n",
42 | "import time\n",
43 | "\n",
44 | "# Connect to your MongoDB cluster\n",
45 | "uri = \"\"\n",
46 | "client = MongoClient(uri)\n",
47 | "\n",
48 | "# Access your database and collection\n",
49 | "database = client[\"sample_mflix\"]\n",
50 | "collection = database[\"embedded_movies\"]\n",
51 | "\n",
52 | "# Create your index model, then create the search index\n",
53 | "search_index_model = SearchIndexModel(\n",
54 | " definition={\n",
55 | " \"fields\": [\n",
56 | " {\n",
57 | " \"type\": \"vector\",\n",
58 | " \"path\": \"plot_embedding_voyage_3_large\",\n",
59 | " \"numDimensions\": 2048,\n",
60 | " \"similarity\": \"dotProduct\",\n",
61 | " \"quantization\": \"scalar\"\n",
62 | " },\n",
63 | " {\n",
64 | " \"type\": \"filter\",\n",
65 | " \"path\": \"genres\"\n",
66 | " },\n",
67 | " {\n",
68 | " \"type\": \"filter\",\n",
69 | " \"path\": \"year\"\n",
70 | " }\n",
71 | " ]\n",
72 | " },\n",
73 | " name=\"vector_index\",\n",
74 | " type=\"vectorSearch\",\n",
75 | ")\n",
76 | "\n",
77 | "result = collection.create_search_index(model=search_index_model)\n",
78 | "print(\"New search index named \" + result + \" is building.\")\n",
79 | "\n",
80 | "# Wait for initial sync to complete\n",
81 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
82 | "predicate=None\n",
83 | "if predicate is None:\n",
84 | " predicate = lambda index: index.get(\"queryable\") is True\n",
85 | "\n",
86 | "while True:\n",
87 | " indices = list(collection.list_search_indexes(result))\n",
88 | " if len(indices) and predicate(indices[0]):\n",
89 | " break\n",
90 | " time.sleep(5)\n",
91 | "print(result + \" is ready for querying.\")\n",
92 | "\n",
93 | "client.close()\n"
94 | ]
95 | }
96 | ],
97 | "metadata": {
98 | "kernelspec": {
99 | "display_name": "Python 3",
100 | "language": "python",
101 | "name": "python3"
102 | },
103 | "language_info": {
104 | "codemirror_mode": {
105 | "name": "ipython",
106 | "version": 3
107 | },
108 | "file_extension": ".py",
109 | "mimetype": "text/x-python",
110 | "name": "python",
111 | "nbconvert_exporter": "python",
112 | "pygments_lexer": "ipython3",
113 | "version": "3.9.12"
114 | }
115 | },
116 | "nbformat": 4,
117 | "nbformat_minor": 2
118 | }
119 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-parent-document-retrieval.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# LangChain MongoDB Integration - Parent Document Retrieval"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Parent Document Retrieval](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/parent-document-retrieval/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pymongo pypdf"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "\n",
42 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
44 | "MONGODB_URI = \"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
54 | "from langchain_community.document_loaders import PyPDFLoader\n",
55 | "\n",
56 | "# Load the PDF\n",
57 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12881/pdf\") \n",
58 | "data = loader.load()\n",
59 | "\n",
60 | "# Chunk into parent documents\n",
61 | "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)\n",
62 | "docs = parent_splitter.split_documents(data)\n",
63 | "\n",
64 | "# Print a document\n",
65 | "docs[0]"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "from langchain_mongodb.retrievers import MongoDBAtlasParentDocumentRetriever\n",
75 | "from langchain_voyageai import VoyageAIEmbeddings\n",
76 | "\n",
77 | "# Define the embedding model to use\n",
78 | "embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n",
79 | "\n",
80 | "# Define the chunking method for the child documents\n",
81 | "child_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n",
82 | "\n",
83 | "# Specify the database and collection name\n",
84 | "database_name = \"langchain_db\"\n",
85 | "collection_name = \"parent_document\"\n",
86 | "\n",
87 | "# Create the parent document retriever\n",
88 | "parent_doc_retriever = MongoDBAtlasParentDocumentRetriever.from_connection_string(\n",
89 | " connection_string = MONGODB_URI,\n",
90 | " child_splitter = child_splitter,\n",
91 | " embedding_model = embedding_model,\n",
92 | " database_name = database_name,\n",
93 | " collection_name = collection_name,\n",
94 | " text_key = \"page_content\",\n",
95 | " relevance_score_fn = \"dotProduct\",\n",
96 | " search_kwargs = { \"k\": 10 },\n",
97 | ")"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# Ingest the documents into Atlas\n",
107 | "parent_doc_retriever.add_documents(docs)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "# Get the vector store instance from the retriever\n",
117 | "vector_store = parent_doc_retriever.vectorstore\n",
118 | "\n",
119 | "# Use helper method to create the vector search index\n",
120 | "vector_store.create_vector_search_index(\n",
121 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
122 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
123 | ")\n"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "# Run a vector search query\n",
133 | "parent_doc_retriever.invoke(\"AI technology\")"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "from langchain_core.output_parsers import StrOutputParser\n",
143 | "from langchain_core.prompts import PromptTemplate\n",
144 | "from langchain_core.runnables import RunnablePassthrough\n",
145 | "from langchain_openai import ChatOpenAI\n",
146 | "\n",
147 | "# Define a prompt template\n",
148 | "template = \"\"\"\n",
149 | " Use the following pieces of context to answer the question at the end.\n",
150 | " {context}\n",
151 | " Question: {query}?\n",
152 | "\"\"\"\n",
153 | "prompt = PromptTemplate.from_template(template)\n",
154 | "model = ChatOpenAI()\n",
155 | "\n",
156 | "# Construct a chain to answer questions on your data\n",
157 | "chain = (\n",
158 | " {\"context\": parent_doc_retriever, \"query\": RunnablePassthrough()}\n",
159 | " | prompt\n",
160 | " | model\n",
161 | " | StrOutputParser()\n",
162 | ")\n",
163 | "\n",
164 | "# Prompt the chain\n",
165 | "query = \"In a list, what are MongoDB's latest AI announcements?\"\n",
166 | "answer = chain.invoke(query)\n",
167 | "print(answer)"
168 | ]
169 | }
170 | ],
171 | "metadata": {
172 | "kernelspec": {
173 | "display_name": "Python 3",
174 | "language": "python",
175 | "name": "python3"
176 | },
177 | "language_info": {
178 | "codemirror_mode": {
179 | "name": "ipython",
180 | "version": 3
181 | },
182 | "file_extension": ".py",
183 | "mimetype": "text/x-python",
184 | "name": "python",
185 | "nbconvert_exporter": "python",
186 | "pygments_lexer": "ipython3",
187 | "version": "3.9.12"
188 | }
189 | },
190 | "nbformat": 4,
191 | "nbformat_minor": 2
192 | }
193 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-hybrid-search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# LangChain MongoDB Integration - Hybrid Search"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [LangChain Hybrid Search](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/hybrid-search/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai pymongo pypdf"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "\n",
42 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
43 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
44 | "MONGODB_URI = \"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n",
54 | "from langchain_voyageai import VoyageAIEmbeddings\n",
55 | "\n",
56 | "# Create the vector store\n",
57 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n",
58 | " connection_string = MONGODB_URI,\n",
59 | " embedding = VoyageAIEmbeddings(model = \"voyage-3-large\", output_dimension = 2048),\n",
60 | " namespace = \"sample_mflix.embedded_movies\",\n",
61 | " text_key = \"plot\",\n",
62 | " embedding_key = \"plot_embedding_voyage_3_large\",\n",
63 | " relevance_score_fn = \"dotProduct\"\n",
64 | ")"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "# Use helper method to create the vector search index\n",
74 | "vector_store.create_vector_search_index(\n",
75 | " dimensions = 2048, # The dimensions of the vector embeddings to be indexed\n",
76 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
77 | ")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "from langchain_mongodb.index import create_fulltext_search_index\n",
87 | "from pymongo import MongoClient\n",
88 | "\n",
89 | "# Connect to your cluster\n",
90 | "client = MongoClient(MONGODB_URI)\n",
91 | "\n",
92 | "# Use helper method to create the search index\n",
93 | "create_fulltext_search_index(\n",
94 | " collection = client[\"sample_mflix\"][\"embedded_movies\"],\n",
95 | " field = \"plot\",\n",
96 | " index_name = \"search_index\",\n",
97 | " wait_until_complete = 60\n",
98 | ")"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever\n",
108 | "\n",
109 | "# Initialize the retriever\n",
110 | "retriever = MongoDBAtlasHybridSearchRetriever(\n",
111 | " vectorstore = vector_store,\n",
112 | " search_index_name = \"search_index\",\n",
113 | " top_k = 5,\n",
114 | " fulltext_penalty = 50,\n",
115 | " vector_penalty = 50,\n",
116 | " post_filter=[\n",
117 | " {\n",
118 | " \"$project\": {\n",
119 | " \"plot_embedding\": 0,\n",
120 | " \"plot_embedding_voyage_3_large\": 0\n",
121 | " }\n",
122 | " }\n",
123 | " ])\n",
124 | "\n",
125 | "# Define your query\n",
126 | "query = \"time travel\"\n",
127 | "\n",
128 | "# Print results\n",
129 | "documents = retriever.invoke(query)\n",
130 | "for doc in documents:\n",
131 | " print(\"Title: \" + doc.metadata[\"title\"])\n",
132 | " print(\"Plot: \" + doc.page_content)\n",
133 | " print(\"Search score: {}\".format(doc.metadata[\"fulltext_score\"]))\n",
134 | " print(\"Vector Search score: {}\".format(doc.metadata[\"vector_score\"]))\n",
135 | " print(\"Total score: {}\\n\".format(doc.metadata[\"fulltext_score\"] + doc.metadata[\"vector_score\"]))"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "from langchain_core.output_parsers import StrOutputParser\n",
145 | "from langchain_core.prompts import PromptTemplate\n",
146 | "from langchain_core.runnables import RunnablePassthrough\n",
147 | "from langchain_openai import ChatOpenAI\n",
148 | "\n",
149 | "# Define a prompt template\n",
150 | "template = \"\"\"\n",
151 | " Use the following pieces of context to answer the question at the end.\n",
152 | " {context}\n",
153 | " Question: Can you recommend some movies about {query}?\n",
154 | "\"\"\"\n",
155 | "prompt = PromptTemplate.from_template(template)\n",
156 | "model = ChatOpenAI()\n",
157 | "\n",
158 | "# Construct a chain to answer questions on your data\n",
159 | "chain = (\n",
160 | " {\"context\": retriever, \"query\": RunnablePassthrough()}\n",
161 | " | prompt\n",
162 | " | model\n",
163 | " | StrOutputParser()\n",
164 | ")\n",
165 | "\n",
166 | "# Prompt the chain\n",
167 | "query = \"time travel\"\n",
168 | "answer = chain.invoke(query)\n",
169 | "print(answer)"
170 | ]
171 | }
172 | ],
173 | "metadata": {
174 | "kernelspec": {
175 | "display_name": "Python 3",
176 | "language": "python",
177 | "name": "python3"
178 | },
179 | "language_info": {
180 | "codemirror_mode": {
181 | "name": "ipython",
182 | "version": 3
183 | },
184 | "file_extension": ".py",
185 | "mimetype": "text/x-python",
186 | "name": "python",
187 | "nbconvert_exporter": "python",
188 | "pygments_lexer": "ipython3",
189 | "version": "3.9.12"
190 | }
191 | },
192 | "nbformat": 4,
193 | "nbformat_minor": 2
194 | }
195 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-natural-language.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "f9696293",
6 | "metadata": {},
7 | "source": [
8 | "# Query MongoDB with Natural Language Using LangChain and LangGraph"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "e696dea0",
14 | "metadata": {},
15 | "source": [
16 | "This notebook is a companion to the [Query MongoDB with Natural Language Using LangChain and LangGraph](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/natural-language-to-mql/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
17 | "\n",
18 | "This notebook demonstrates how to query a MongoDB cluster with a natural language prompt using an AI agent built with the [LangChain MongoDB Toolkit](https://langchain-mongodb.readthedocs.io/en/latest/langchain_mongodb/agent_toolkit/langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit.html#langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit) and the [LangGraph ReAct Agent Framework](https://langchain-ai.github.io/langgraph/agents/agents/).\n",
19 | "\n",
20 | "\n",
21 | "
\n",
22 | ""
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "f106dda9",
29 | "metadata": {
30 | "vscode": {
31 | "languageId": "shellscript"
32 | }
33 | },
34 | "outputs": [],
35 | "source": [
36 | "pip install --quiet --upgrade langchain-mongodb langchain-openai langgraph"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "id": "998157e0",
42 | "metadata": {},
43 | "source": [
44 | "## Set up your environment\n",
45 | "\n",
46 | "Before you begin, make sure you have the following:\n",
47 | "\n",
48 | "- A MongoDB cluster up and running (you'll need the [connection string](https://www.mongodb.com/docs/manual/reference/connection-string/))\n",
49 | "- An API key to access an LLM (This tutorial uses a model from OpenAI, but you can use any model [supported by LangChain](https://python.langchain.com/docs/integrations/chat/))"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "694ccd64",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "os.environ[\"OPENAI_API_KEY\"] = ''\n",
60 | "MONGODB_URI = ''\n",
61 | "DB_NAME = 'sample_restaurants'\n",
62 | "NATURAL_LANGUAGE_QUERY = 'Find all restaurants that serve hamburgers.'"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "c764c565",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "import os\n",
73 | "from langchain_openai import ChatOpenAI\n",
74 | "from langgraph.prebuilt import create_react_agent\n",
75 | "from langchain_mongodb.agent_toolkit import (\n",
76 | " MONGODB_AGENT_SYSTEM_PROMPT,\n",
77 | " MongoDBDatabase,\n",
78 | " MongoDBDatabaseToolkit,\n",
79 | ")"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "id": "5a6b006c",
85 | "metadata": {},
86 | "source": [
87 | "## Build the agent\n",
88 | "\n",
89 | "Next, define the `NaturalLanguageToMQL` Python class.\n",
90 | "\n",
91 | "#### Key Points\n",
92 | "\n",
93 | "- `self.toolkit`, the tools that the agent can use, is an instance of the [MongoDB Toolkit](https://langchain-mongodb.readthedocs.io/en/latest/langchain_mongodb/agent_toolkit/langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit.html#langchain_mongodb.agent_toolkit.toolkit.MongoDBDatabaseToolkit). \n",
94 | "\n",
95 | "- `self.agent`, the agent itself, is an instance of the [ReAct Agent framework](https://langchain-ai.github.io/langgraph/agents/agents/), which takes `self.toolkit` as a parameter."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "b45185db",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "class NaturalLanguageToMQL:\n",
106 | " def __init__(self):\n",
107 | " self.llm = ChatOpenAI(model=\"gpt-4o-mini\", timeout=60)\n",
108 | " self.system_message = MONGODB_AGENT_SYSTEM_PROMPT.format(top_k=5)\n",
109 | " self.db_wrapper = MongoDBDatabase.from_connection_string(\n",
110 | " MONGODB_URI, \n",
111 | " database=DB_NAME)\n",
112 | " self.toolkit = MongoDBDatabaseToolkit(db=self.db_wrapper, llm=self.llm)\n",
113 | " self.agent = create_react_agent(\n",
114 | " self.llm, \n",
115 | " self.toolkit.get_tools(), \n",
116 | " state_modifier=self.system_message)\n",
117 | " self.messages = []\n",
118 | "\n",
119 | " def convert_to_mql_and_execute_query(self, query):\n",
120 | " # Start the agent with the agent.stream() method\n",
121 | " events = self.agent.stream(\n",
122 | " {'messages': [('user', query)]},\n",
123 | " stream_mode='values',\n",
124 | " )\n",
125 | " # Add output (events) from the agent to the self.messages list\n",
126 | " for event in events:\n",
127 | " self.messages.extend(event['messages'])\n",
128 | " \n",
129 | " def print_results(self):\n",
130 | " # Print the the end-user's expected output from \n",
131 | " # the final message produced by the agent.\n",
132 | " print(self.messages[-1].content)"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "id": "c90825eb",
138 | "metadata": {},
139 | "source": [
140 | "## Run a sample query\n",
141 | "\n",
142 | "And finally, instantiate the `NaturalLanguageToMQL` class and run a sample query."
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "id": "b7284c63",
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "def main():\n",
153 | " converter = NaturalLanguageToMQL()\n",
154 | " converter.convert_to_mql_and_execute_query(NATURAL_LANGUAGE_QUERY)\n",
155 | " converter.print_results()\n",
156 | "\n",
157 | "main()"
158 | ]
159 | }
160 | ],
161 | "metadata": {
162 | "kernelspec": {
163 | "display_name": "Python 3",
164 | "language": "python",
165 | "name": "python3"
166 | },
167 | "language_info": {
168 | "codemirror_mode": {
169 | "name": "ipython",
170 | "version": 3
171 | },
172 | "file_extension": ".py",
173 | "mimetype": "text/x-python",
174 | "name": "python",
175 | "nbconvert_exporter": "python",
176 | "pygments_lexer": "ipython3",
177 | "version": "3.10.12"
178 | }
179 | },
180 | "nbformat": 4,
181 | "nbformat_minor": 2
182 | }
183 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-local-rag.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# LangChain MongoDB Integration - Implement RAG Locally"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [LangChain Local RAG](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/get-started/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "vscode": {
25 | "languageId": "shellscript"
26 | }
27 | },
28 | "source": [
29 | "## Create a local Atlas deployment\n",
30 | "\n",
31 | "Run the following command in your terminal to set up your local Atlas deployment. \n",
32 | "\n",
33 | "```\n",
34 | "atlas deployments setup\n",
35 | "```"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "vscode": {
42 | "languageId": "shellscript"
43 | }
44 | },
45 | "source": [
46 | "## Set up the environment"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "vscode": {
54 | "languageId": "shellscript"
55 | }
56 | },
57 | "outputs": [],
58 | "source": [
59 | "pip install --quiet --upgrade pymongo langchain langchain-community langchain-huggingface langchain-text-splitters gpt4all pypdf"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "MONGODB_URI = (\"mongodb://localhost:/?directConnection=true\")"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Configure the vector store"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n",
85 | "from langchain_huggingface import HuggingFaceEmbeddings\n",
86 | "\n",
87 | "# Load the embedding model (https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)\n",
88 | "embedding_model = HuggingFaceEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")\n",
89 | "\n",
90 | "# Instantiate vector store\n",
91 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n",
92 | " connection_string = MONGODB_URI,\n",
93 | " namespace = \"langchain_db.local_rag\",\n",
94 | " embedding=embedding_model,\n",
95 | " index_name=\"vector_index\"\n",
96 | ")"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "from langchain_community.document_loaders import PyPDFLoader\n",
106 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
107 | "\n",
108 | "# Load the PDF\n",
109 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n",
110 | "data = loader.load()\n",
111 | "\n",
112 | "# Split PDF into documents\n",
113 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n",
114 | "docs = text_splitter.split_documents(data)\n",
115 | "\n",
116 | "# Add data to the vector store\n",
117 | "vector_store.add_documents(docs)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "vector_store.create_vector_search_index(\n",
127 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
128 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
129 | ")"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "## Implement RAG with a local LLM\n",
137 | "Before running the following code, [download the local model](https://gpt4all.io/models/gguf/mistral-7b-openorca.gguf2.Q4_0.gguf)."
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
147 | "from langchain_community.llms import GPT4All\n",
148 | "\n",
149 | "# Configure the LLM\n",
150 | "local_path = \"\"\n",
151 | "\n",
152 | "# Callbacks support token-wise streaming\n",
153 | "callbacks = [StreamingStdOutCallbackHandler()]\n",
154 | "\n",
155 | "# Verbose is required to pass to the callback manager\n",
156 | "llm = GPT4All(model=local_path, callbacks=callbacks, verbose=True)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "from langchain_core.prompts import PromptTemplate\n",
166 | "from langchain_core.output_parsers import StrOutputParser\n",
167 | "from langchain_core.runnables import RunnablePassthrough\n",
168 | "import pprint\n",
169 | "\n",
170 | "# Instantiate MongoDB Vector Search as a retriever\n",
171 | "retriever = vector_store.as_retriever()\n",
172 | "\n",
173 | "# Define prompt template\n",
174 | "template = \"\"\"\n",
175 | "Use the following pieces of context to answer the question at the end.\n",
176 | "{context}\n",
177 | "Question: {question}\n",
178 | "\"\"\"\n",
179 | "custom_rag_prompt = PromptTemplate.from_template(template)\n",
180 | "\n",
181 | "def format_docs(docs):\n",
182 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
183 | "\n",
184 | "# Create chain \n",
185 | "rag_chain = (\n",
186 | " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
187 | " | custom_rag_prompt\n",
188 | " | llm\n",
189 | " | StrOutputParser()\n",
190 | ")\n",
191 | "\n",
192 | "# Prompt the chain\n",
193 | "question = \"What was MongoDB's latest acquisition?\"\n",
194 | "answer = rag_chain.invoke(question)\n",
195 | "\n",
196 | "# Return source documents\n",
197 | "documents = retriever.invoke(question)\n",
198 | "print(\"\\nSource documents:\")\n",
199 | "pprint.pprint(documents)"
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "Python 3",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.10.12"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 |
--------------------------------------------------------------------------------
/ai-integrations/semantic-kernel.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Semantic Kernel Integration"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Semantic Kernel Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/semantic-kernel/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade semantic-kernel openai motor"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import semantic_kernel as sk\n",
41 | "from semantic_kernel.connectors.ai.open_ai import (OpenAIChatCompletion, OpenAITextEmbedding)\n",
42 | "from semantic_kernel.connectors.memory.mongodb_atlas import MongoDBAtlasMemoryStore\n",
43 | "from semantic_kernel.core_plugins.text_memory_plugin import TextMemoryPlugin\n",
44 | "from semantic_kernel.memory.semantic_text_memory import SemanticTextMemory\n",
45 | "from semantic_kernel.prompt_template.input_variable import InputVariable\n",
46 | "from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig\n",
47 | "from pymongo import MongoClient\n",
48 | "from pymongo.operations import SearchIndexModel"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "OPENAI_API_KEY = \"\"\n",
58 | "MONGODB_URI = \"\""
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "kernel = sk.Kernel()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "chat_service = OpenAIChatCompletion(\n",
77 | " service_id=\"chat\",\n",
78 | " ai_model_id=\"gpt-3.5-turbo\",\n",
79 | " api_key=OPENAI_API_KEY\n",
80 | ")\n",
81 | "embedding_service = OpenAITextEmbedding(\n",
82 | " ai_model_id=\"text-embedding-ada-002\",\n",
83 | " api_key=OPENAI_API_KEY\n",
84 | ")\n",
85 | "kernel.add_service(chat_service)\n",
86 | "kernel.add_service(embedding_service)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "mongodb_atlas_memory_store = MongoDBAtlasMemoryStore(\n",
96 | " connection_string=MONGODB_URI,\n",
97 | " database_name=\"semantic_kernel_db\",\n",
98 | " index_name=\"vector_index\"\n",
99 | ")\n",
100 | "\n",
101 | "memory = SemanticTextMemory(\n",
102 | " storage=mongodb_atlas_memory_store,\n",
103 | " embeddings_generator=embedding_service\n",
104 | ")\n",
105 | "kernel.add_plugin(TextMemoryPlugin(memory), \"TextMemoryPlugin\")"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "async def populate_memory(kernel: sk.Kernel) -> None:\n",
115 | " await memory.save_information(\n",
116 | " collection=\"test\", id=\"1\", text=\"I am a developer\"\n",
117 | " )\n",
118 | " await memory.save_information(\n",
119 | " collection=\"test\", id=\"2\", text=\"I started using MongoDB two years ago\"\n",
120 | " )\n",
121 | " await memory.save_information(\n",
122 | " collection=\"test\", id=\"3\", text=\"I'm using MongoDB Vector Search with Semantic Kernel to implement RAG\"\n",
123 | " )\n",
124 | " await memory.save_information(\n",
125 | " collection=\"test\", id=\"4\", text=\"I like coffee\"\n",
126 | " )\n",
127 | "\n",
128 | "print(\"Populating memory...\")\n",
129 | "await populate_memory(kernel)\n",
130 | "print(kernel)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# Connect to your MongoDB cluster and specify the collection\n",
140 | "client = MongoClient(MONGODB_URI)\n",
141 | "collection = client[\"semantic_kernel_db\"][\"test\"]\n",
142 | "\n",
143 | "# Create your index model, then create the search index\n",
144 | "search_index_model = SearchIndexModel(\n",
145 | " definition={\n",
146 | " \"fields\": [\n",
147 | " {\n",
148 | " \"type\": \"vector\",\n",
149 | " \"path\": \"embedding\",\n",
150 | " \"numDimensions\": 1536,\n",
151 | " \"similarity\": \"cosine\"\n",
152 | " }\n",
153 | " ]\n",
154 | " },\n",
155 | " name=\"vector_index\",\n",
156 | " type=\"vectorSearch\"\n",
157 | ")\n",
158 | "\n",
159 | "collection.create_search_index(model=search_index_model)"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "## Semantic Search Query"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "result = await memory.search(\"test\", \"What is my job title?\")\n",
176 | "print(f\"Retrieved document: {result[0].text}, {result[0].relevance}\")"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "## Basic RAG"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "service_id = \"chat\"\n",
193 | "settings = kernel.get_service(service_id).instantiate_prompt_execution_settings(\n",
194 | " service_id=service_id\n",
195 | ")\n",
196 | "\n",
197 | "prompt_template = \"\"\"\n",
198 | " Answer the following question based on the given context.\n",
199 | "\n",
200 | " Question: {{$input}}\n",
201 | " Context: {{$context}}\n",
202 | "\"\"\"\n",
203 | "\n",
204 | "chat_prompt_template_config = PromptTemplateConfig(\n",
205 | " execution_settings=settings,\n",
206 | " input_variables=[\n",
207 | " InputVariable(name=\"input\"),\n",
208 | " InputVariable(name=\"context\")\n",
209 | " ],\n",
210 | " template=prompt_template\n",
211 | ")\n",
212 | "\n",
213 | "prompt = kernel.add_function(\n",
214 | " function_name=\"RAG\",\n",
215 | " plugin_name=\"TextMemoryPlugin\",\n",
216 | " prompt_template_config=chat_prompt_template_config,\n",
217 | ")\n",
218 | "\n",
219 | "question = \"When did I start using MongoDB?\"\n",
220 | "results = await memory.search(\"test\", question)\n",
221 | "retrieved_document = results[0].text\n",
222 | "answer = await prompt.invoke(\n",
223 | " kernel=kernel, input=question, context=retrieved_document\n",
224 | ")\n",
225 | "print(answer)"
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Python 3",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.9.12"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 2
250 | }
251 |
--------------------------------------------------------------------------------
/use-cases/local-rag.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Local Retrieval-Augmented Generation (RAG)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Local Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/tutorials/local-rag/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you a RAG implementation with MongoDB Vector Search that you can run **completely locally** by using models from Hugging Face and GPT4All.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "## Create a local Atlas deployment\n",
28 | "\n",
29 | "Run the following commands in your terminal to set up your local Atlas deployment. \n",
30 | "\n",
31 | "```\n",
32 | "atlas deployments setup\n",
33 | "curl https://atlas-education.s3.amazonaws.com/sampledata.archive -o sampledata.archive\n",
34 | "mongorestore --archive=sampledata.archive --port=\n",
35 | "```"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "vscode": {
43 | "languageId": "shellscript"
44 | }
45 | },
46 | "outputs": [],
47 | "source": [
48 | "pip install --quiet --upgrade pymongo gpt4all sentence_transformers"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "MONGODB_URI = (\"\")\n",
58 | "# Use \"mongodb://localhost:/?directConnection=true\" for local Atlas deployments"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "from pymongo import MongoClient\n",
68 | "from sentence_transformers import SentenceTransformer\n",
69 | "\n",
70 | "# Connect to your local Atlas deployment or MongoDB cluster\n",
71 | "client = MongoClient(MONGODB_URI)\n",
72 | "\n",
73 | "# Select the sample_airbnb.listingsAndReviews collection\n",
74 | "collection = client[\"sample_airbnb\"][\"listingsAndReviews\"]\n",
75 | "\n",
76 | "# Load the embedding model (https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)\n",
77 | "model_path = \"\"\n",
78 | "model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')\n",
79 | "model.save(model_path)\n",
80 | "model = SentenceTransformer(model_path)\n",
81 | "\n",
82 | "# Define function to generate embeddings\n",
83 | "def get_embedding(text):\n",
84 | " return model.encode(text).tolist()\n",
85 | "\n",
86 | "# Filters for only documents with a summary field and without an embeddings field\n",
87 | "filter = { '$and': [ { 'summary': { '$exists': True, '$ne': None } }, { 'embeddings': { '$exists': False } } ] }\n",
88 | "\n",
89 | "# Creates embeddings for subset of the collection\n",
90 | "updated_doc_count = 0\n",
91 | "for document in collection.find(filter).limit(50):\n",
92 | " text = document['summary']\n",
93 | " embedding = get_embedding(text)\n",
94 | " collection.update_one({ '_id': document['_id'] }, { \"$set\": { 'embeddings': embedding } }, upsert=True)\n",
95 | " updated_doc_count += 1\n",
96 | "\n",
97 | "print(\"Documents updated: {}\".format(updated_doc_count))"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from pymongo.operations import SearchIndexModel\n",
107 | "\n",
108 | "# Create your index model, then create the search index\n",
109 | "search_index_model = SearchIndexModel(\n",
110 | " definition = {\n",
111 | " \"fields\": [\n",
112 | " {\n",
113 | " \"type\": \"vector\",\n",
114 | " \"numDimensions\": 1024,\n",
115 | " \"path\": \"embeddings\",\n",
116 | " \"similarity\": \"cosine\"\n",
117 | " }\n",
118 | " ]\n",
119 | " },\n",
120 | " name = \"vector_index\",\n",
121 | " type = \"vectorSearch\" \n",
122 | ")\n",
123 | "collection.create_search_index(model=search_index_model)\n"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "# Function to get the results of a vector search query\n",
133 | "def get_query_results(query):\n",
134 | " query_embedding = get_embedding(query)\n",
135 | "\n",
136 | " pipeline = [\n",
137 | " {\n",
138 | " \"$vectorSearch\": {\n",
139 | " \"index\": \"vector_index\",\n",
140 | " \"queryVector\": query_embedding,\n",
141 | " \"path\": \"embeddings\",\n",
142 | " \"exact\": True,\n",
143 | " \"limit\": 5\n",
144 | " }\n",
145 | " }, {\n",
146 | " \"$project\": {\n",
147 | " \"_id\": 0,\n",
148 | " \"summary\": 1,\n",
149 | " \"listing_url\": 1,\n",
150 | " \"score\": {\n",
151 | " \"$meta\": \"vectorSearchScore\"\n",
152 | " }\n",
153 | " }\n",
154 | " }\n",
155 | " ]\n",
156 | "\n",
157 | " results = collection.aggregate(pipeline)\n",
158 | "\n",
159 | " array_of_results = []\n",
160 | " for doc in results:\n",
161 | " array_of_results.append(doc)\n",
162 | " return array_of_results"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "import pprint\n",
172 | "pprint.pprint(get_query_results(\"beach house\"))"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "from gpt4all import GPT4All\n",
182 | "\n",
183 | "# Download the model and move it to the same directory as this notebook\n",
184 | "# For complete details, refer to the documentation page\n",
185 | "local_llm_path = \"./mistral-7b-openorca.gguf2.Q4_0.gguf\"\n",
186 | "local_llm = GPT4All(local_llm_path)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "question = \"Can you recommend a few AirBnBs that are beach houses? Include a link to the listing.\"\n",
196 | "documents = get_query_results(question)\n",
197 | "\n",
198 | "text_documents = \"\"\n",
199 | "for doc in documents:\n",
200 | " summary = doc.get(\"summary\", \"\")\n",
201 | " link = doc.get(\"listing_url\", \"\")\n",
202 | " string = f\"Summary: {summary} Link: {link}. \\n\"\n",
203 | " text_documents += string\n",
204 | "\n",
205 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n",
206 | " {text_documents}\n",
207 | " Question: {question}\n",
208 | "\"\"\"\n",
209 | "\n",
210 | "response = local_llm.generate(prompt)\n",
211 | "cleaned_response = response.replace('\\\\n', '\\n')\n",
212 | "print(cleaned_response)"
213 | ]
214 | }
215 | ],
216 | "metadata": {
217 | "kernelspec": {
218 | "display_name": "Python 3",
219 | "language": "python",
220 | "name": "python3"
221 | },
222 | "language_info": {
223 | "codemirror_mode": {
224 | "name": "ipython",
225 | "version": 3
226 | },
227 | "file_extension": ".py",
228 | "mimetype": "text/x-python",
229 | "name": "python",
230 | "nbconvert_exporter": "python",
231 | "pygments_lexer": "ipython3",
232 | "version": "3.9.12"
233 | }
234 | },
235 | "nbformat": 4,
236 | "nbformat_minor": 2
237 | }
238 |
--------------------------------------------------------------------------------
/use-cases/rag.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Retrieval-Augmented Generation (RAG)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/rag/#get-started) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to implement RAG with MongoDB Vector Search by using open-source models from Hugging Face.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade pymongo sentence_transformers einops langchain langchain_community langchain-text-splitters pypdf huggingface_hub"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "\n",
44 | "# Specify your Hugging Face access token\n",
45 | "os.environ[\"HF_TOKEN\"] = \"\""
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from sentence_transformers import SentenceTransformer\n",
55 | "\n",
56 | "# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1\")\n",
57 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n",
58 | " \n",
59 | "# Define a function to generate embeddings\n",
60 | "def get_embedding(data):\n",
61 | " \"\"\"Generates vector embeddings for the given data.\"\"\"\n",
62 | "\n",
63 | " embedding = model.encode(data)\n",
64 | " return embedding.tolist()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from langchain_community.document_loaders import PyPDFLoader\n",
74 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
75 | "\n",
76 | "# Load the PDF\n",
77 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12236/pdf\")\n",
78 | "data = loader.load()\n",
79 | "\n",
80 | "# Split the data into chunks\n",
81 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)\n",
82 | "documents = text_splitter.split_documents(data)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Prepare documents for insertion\n",
92 | "docs_to_insert = [{\n",
93 | " \"text\": doc.page_content,\n",
94 | " \"embedding\": get_embedding(doc.page_content)\n",
95 | "} for doc in documents]"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "from pymongo import MongoClient\n",
105 | "\n",
106 | "# Connect to your MongoDB cluster\n",
107 | "client = MongoClient(\"\")\n",
108 | "collection = client[\"rag_db\"][\"test\"]\n",
109 | "\n",
110 | "# Insert documents into the collection\n",
111 | "result = collection.insert_many(docs_to_insert)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "from pymongo.operations import SearchIndexModel\n",
121 | "import time\n",
122 | "\n",
123 | "# Create your index model, then create the search index\n",
124 | "index_name=\"vector_index\"\n",
125 | "search_index_model = SearchIndexModel(\n",
126 | " definition = {\n",
127 | " \"fields\": [\n",
128 | " {\n",
129 | " \"type\": \"vector\",\n",
130 | " \"numDimensions\": 768,\n",
131 | " \"path\": \"embedding\",\n",
132 | " \"similarity\": \"cosine\"\n",
133 | " }\n",
134 | " ]\n",
135 | " },\n",
136 | " name = index_name,\n",
137 | " type = \"vectorSearch\"\n",
138 | ")\n",
139 | "collection.create_search_index(model=search_index_model)\n",
140 | "\n",
141 | "# Wait for initial sync to complete\n",
142 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
143 | "predicate=None\n",
144 | "if predicate is None:\n",
145 | " predicate = lambda index: index.get(\"queryable\") is True\n",
146 | "\n",
147 | "while True:\n",
148 | " indices = list(collection.list_search_indexes(index_name))\n",
149 | " if len(indices) and predicate(indices[0]):\n",
150 | " break\n",
151 | " time.sleep(5)\n",
152 | "print(index_name + \" is ready for querying.\")"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "# Define a function to run vector search queries\n",
162 | "def get_query_results(query):\n",
163 | " \"\"\"Gets results from a vector search query.\"\"\"\n",
164 | "\n",
165 | " query_embedding = get_embedding(query)\n",
166 | " pipeline = [\n",
167 | " {\n",
168 | " \"$vectorSearch\": {\n",
169 | " \"index\": \"vector_index\",\n",
170 | " \"queryVector\": query_embedding,\n",
171 | " \"path\": \"embedding\",\n",
172 | " \"exact\": True,\n",
173 | " \"limit\": 5\n",
174 | " }\n",
175 | " }, {\n",
176 | " \"$project\": {\n",
177 | " \"_id\": 0,\n",
178 | " \"text\": 1\n",
179 | " }\n",
180 | " }\n",
181 | " ]\n",
182 | "\n",
183 | " results = collection.aggregate(pipeline)\n",
184 | "\n",
185 | " array_of_results = []\n",
186 | " for doc in results:\n",
187 | " array_of_results.append(doc)\n",
188 | " return array_of_results\n",
189 | "\n",
190 | "# Test the function with a sample query\n",
191 | "import pprint\n",
192 | "pprint.pprint(get_query_results(\"AI technology\"))"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "from huggingface_hub import InferenceClient\n",
202 | "\n",
203 | "# Specify search query, retrieve relevant documents, and convert to string\n",
204 | "query = \"What are MongoDB's latest AI announcements?\"\n",
205 | "context_docs = get_query_results(query)\n",
206 | "context_string = \" \".join([doc[\"text\"] for doc in context_docs])\n",
207 | "\n",
208 | "# Construct prompt for the LLM using the retrieved documents as the context\n",
209 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n",
210 | " {context_string}\n",
211 | " Question: {query}\n",
212 | "\"\"\"\n",
213 | "\n",
214 | "# Use a model from Hugging Face\n",
215 | "llm = InferenceClient(\n",
216 | " \"mistralai/Mixtral-8x22B-Instruct-v0.1\",\n",
217 | " provider = \"fireworks-ai\", \n",
218 | " token = os.getenv(\"HF_TOKEN\"))\n",
219 | "\n",
220 | "# Prompt the LLM (this code varies depending on the model you use)\n",
221 | "output = llm.chat_completion(\n",
222 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
223 | " max_tokens=150\n",
224 | ")\n",
225 | "print(output.choices[0].message.content)"
226 | ]
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Python 3",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.9.12"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 2
250 | }
251 |
--------------------------------------------------------------------------------
/use-cases/rag-with-voyage.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Retrieval-Augmented Generation (RAG)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Retrieval-Augmented Generation (RAG)](https://www.mongodb.com/docs/atlas/atlas-vector-search/rag/#get-started) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to implement RAG with MongoDB Vector Search by using the ``voyage-3-large`` embedding model from Voyage AI and an open-source generative model from Hugging Face.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade pymongo sentence_transformers voyageai einops langchain langchain_community pypdf huggingface_hub"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "\n",
44 | "# Specify your Hugging Face access token and Voyage API key\n",
45 | "os.environ[\"HF_TOKEN\"] = \"\"\n",
46 | "os.environ[\"VOYAGE_API_KEY\"] = \"\""
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "import voyageai\n",
56 | "\n",
57 | "# Specify the Voyage AI embedding model\n",
58 | "model = \"voyage-3-large\"\n",
59 | "vo = voyageai.Client()\n",
60 | "\n",
61 | "# Define a function to generate embeddings\n",
62 | "def get_embedding(data, input_type = \"document\"):\n",
63 | " embeddings = vo.embed(\n",
64 | " data, model = model, input_type = input_type\n",
65 | " ).embeddings\n",
66 | " return embeddings[0]"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "from langchain_community.document_loaders import PyPDFLoader\n",
76 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
77 | "\n",
78 | "# Load the PDF\n",
79 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/12236/pdf\")\n",
80 | "data = loader.load()\n",
81 | "\n",
82 | "# Split the data into chunks\n",
83 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)\n",
84 | "documents = text_splitter.split_documents(data)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# Prepare documents for insertion\n",
94 | "docs_to_insert = [{\n",
95 | " \"text\": doc.page_content,\n",
96 | " \"embedding\": get_embedding(doc.page_content)\n",
97 | "} for doc in documents]"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from pymongo import MongoClient\n",
107 | "\n",
108 | "# Connect to your MongoDB cluster\n",
109 | "client = MongoClient(\"\")\n",
110 | "collection = client[\"rag_db\"][\"test\"]\n",
111 | "\n",
112 | "# Insert documents into the collection\n",
113 | "result = collection.insert_many(docs_to_insert)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "from pymongo.operations import SearchIndexModel\n",
123 | "import time\n",
124 | "\n",
125 | "# Create your index model, then create the search index\n",
126 | "index_name=\"vector_index\"\n",
127 | "search_index_model = SearchIndexModel(\n",
128 | " definition = {\n",
129 | " \"fields\": [\n",
130 | " {\n",
131 | " \"type\": \"vector\",\n",
132 | " \"numDimensions\": 1024,\n",
133 | " \"path\": \"embedding\",\n",
134 | " \"similarity\": \"cosine\"\n",
135 | " }\n",
136 | " ]\n",
137 | " },\n",
138 | " name = index_name,\n",
139 | " type = \"vectorSearch\"\n",
140 | ")\n",
141 | "collection.create_search_index(model=search_index_model)\n",
142 | "\n",
143 | "# Wait for initial sync to complete\n",
144 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
145 | "predicate=None\n",
146 | "if predicate is None:\n",
147 | " predicate = lambda index: index.get(\"queryable\") is True\n",
148 | "\n",
149 | "while True:\n",
150 | " indices = list(collection.list_search_indexes(index_name))\n",
151 | " if len(indices) and predicate(indices[0]):\n",
152 | " break\n",
153 | " time.sleep(5)\n",
154 | "print(index_name + \" is ready for querying.\")"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "# Define a function to run vector search queries\n",
164 | "def get_query_results(query):\n",
165 | " \"\"\"Gets results from a vector search query.\"\"\"\n",
166 | "\n",
167 | " query_embedding = get_embedding(query, input_type=\"query\")\n",
168 | " pipeline = [\n",
169 | " {\n",
170 | " \"$vectorSearch\": {\n",
171 | " \"index\": \"vector_index\",\n",
172 | " \"queryVector\": query_embedding,\n",
173 | " \"path\": \"embedding\",\n",
174 | " \"exact\": True,\n",
175 | " \"limit\": 5\n",
176 | " }\n",
177 | " }, {\n",
178 | " \"$project\": {\n",
179 | " \"_id\": 0,\n",
180 | " \"text\": 1\n",
181 | " }\n",
182 | " }\n",
183 | " ]\n",
184 | "\n",
185 | " results = collection.aggregate(pipeline)\n",
186 | "\n",
187 | " array_of_results = []\n",
188 | " for doc in results:\n",
189 | " array_of_results.append(doc)\n",
190 | " return array_of_results\n",
191 | "\n",
192 | "# Test the function with a sample query\n",
193 | "import pprint\n",
194 | "pprint.pprint(get_query_results(\"AI technology\"))"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "from huggingface_hub import InferenceClient\n",
204 | "\n",
205 | "# Specify search query, retrieve relevant documents, and convert to string\n",
206 | "query = \"What are MongoDB's latest AI announcements?\"\n",
207 | "context_docs = get_query_results(query)\n",
208 | "context_string = \" \".join([doc[\"text\"] for doc in context_docs])\n",
209 | "\n",
210 | "# Construct prompt for the LLM using the retrieved documents as the context\n",
211 | "prompt = f\"\"\"Use the following pieces of context to answer the question at the end.\n",
212 | " {context_string}\n",
213 | " Question: {query}\n",
214 | "\"\"\"\n",
215 | "\n",
216 | "# Use a model from Hugging Face\n",
217 | "llm = InferenceClient(\n",
218 | " \"mistralai/Mixtral-8x22B-Instruct-v0.1\",\n",
219 | " provider = \"fireworks-ai\",\n",
220 | " token = os.getenv(\"HF_TOKEN\"))\n",
221 | "\n",
222 | "# Prompt the LLM (this code varies depending on the model you use)\n",
223 | "output = llm.chat_completion(\n",
224 | " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
225 | " max_tokens=150\n",
226 | ")\n",
227 | "print(output.choices[0].message.content)"
228 | ]
229 | }
230 | ],
231 | "metadata": {
232 | "kernelspec": {
233 | "display_name": "Python 3",
234 | "language": "python",
235 | "name": "python3"
236 | },
237 | "language_info": {
238 | "codemirror_mode": {
239 | "name": "ipython",
240 | "version": 3
241 | },
242 | "file_extension": ".py",
243 | "mimetype": "text/x-python",
244 | "name": "python",
245 | "nbconvert_exporter": "python",
246 | "pygments_lexer": "ipython3",
247 | "version": "3.9.12"
248 | }
249 | },
250 | "nbformat": 4,
251 | "nbformat_minor": 2
252 | }
253 |
--------------------------------------------------------------------------------
/create-embeddings/open-source-existing-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - Open Source - Existing Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using the open-source ``nomic-embed-text-v1`` model.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade sentence-transformers pymongo einops"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from sentence_transformers import SentenceTransformer\n",
54 | "\n",
55 | "# Load the embedding model\n",
56 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n",
57 | "\n",
58 | "# Define a function to generate embeddings\n",
59 | "def get_embedding(data, precision=\"float32\"):\n",
60 | " return model.encode(data, precision=precision).tolist()\n",
61 | "\n",
62 | "# Generate an embedding\n",
63 | "embedding = get_embedding(\"foo\")\n",
64 | "print(embedding)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### (Optional) Compress your embeddings\n",
72 | "\n",
73 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from bson.binary import Binary \n",
83 | "from bson.binary import BinaryVectorDtype\n",
84 | "\n",
85 | "# Define a function to generate BSON vectors\n",
86 | "def generate_bson_vector(vector, vector_dtype):\n",
87 | " return Binary.from_vector(vector, vector_dtype)\n",
88 | "\n",
89 | "# Generate BSON vector from the sample float32 embedding\n",
90 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
91 | "\n",
92 | "# Print the converted embedding\n",
93 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## Generate Embeddings"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import pymongo\n",
110 | "\n",
111 | "# Connect to your MongoDB cluster\n",
112 | "mongo_client = pymongo.MongoClient(\"\")\n",
113 | "db = mongo_client[\"sample_airbnb\"]\n",
114 | "collection = db[\"listingsAndReviews\"]\n",
115 | "\n",
116 | "# Define a filter to exclude documents with null or empty 'summary' fields\n",
117 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n",
118 | "\n",
119 | "# Get a subset of documents in the collection\n",
120 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "from pymongo import UpdateOne\n",
130 | "\n",
131 | "# Generate the list of bulk write operations\n",
132 | "operations = []\n",
133 | "for doc in documents:\n",
134 | " summary = doc[\"summary\"]\n",
135 | " # Generate embeddings for this document\n",
136 | " embedding = get_embedding(summary)\n",
137 | "\n",
138 | " # Uncomment the following line to convert to BSON vectors\n",
139 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
140 | "\n",
141 | " # Add the update operation to the list\n",
142 | " operations.append(UpdateOne(\n",
143 | " {\"_id\": doc[\"_id\"]},\n",
144 | " {\"$set\": {\n",
145 | " \"embedding\": embedding\n",
146 | " }}\n",
147 | " ))\n",
148 | "\n",
149 | "# Execute the bulk write operation\n",
150 | "if operations:\n",
151 | " result = collection.bulk_write(operations)\n",
152 | " updated_doc_count = result.modified_count\n",
153 | "\n",
154 | "print(f\"Updated {updated_doc_count} documents.\")"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "## Index and Query Your Embeddings"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "from pymongo.operations import SearchIndexModel\n",
171 | "import time\n",
172 | "\n",
173 | "# Create your index model, then create the search index\n",
174 | "search_index_model = SearchIndexModel(\n",
175 | " definition = {\n",
176 | " \"fields\": [\n",
177 | " {\n",
178 | " \"type\": \"vector\",\n",
179 | " \"path\": \"embedding\",\n",
180 | " \"similarity\": \"dotProduct\",\n",
181 | " \"numDimensions\": 768\n",
182 | " }\n",
183 | " ]\n",
184 | " },\n",
185 | " name=\"vector_index\",\n",
186 | " type=\"vectorSearch\"\n",
187 | ")\n",
188 | "result = collection.create_search_index(model=search_index_model)\n",
189 | "\n",
190 | "# Wait for initial sync to complete\n",
191 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
192 | "predicate=None\n",
193 | "if predicate is None:\n",
194 | " predicate = lambda index: index.get(\"queryable\") is True\n",
195 | "\n",
196 | "while True:\n",
197 | " indices = list(collection.list_search_indexes(result))\n",
198 | " if len(indices) and predicate(indices[0]):\n",
199 | " break\n",
200 | " time.sleep(5)\n",
201 | "print(result + \" is ready for querying.\")"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# Generate embedding for the search query\n",
211 | "query_embedding = get_embedding(\"beach house\")\n",
212 | "\n",
213 | "# Sample vector search pipeline\n",
214 | "pipeline = [\n",
215 | " {\n",
216 | " \"$vectorSearch\": {\n",
217 | " \"index\": \"vector_index\",\n",
218 | " \"queryVector\": query_embedding,\n",
219 | " \"path\": \"embedding\",\n",
220 | " \"exact\": True,\n",
221 | " \"limit\": 5\n",
222 | " }\n",
223 | " }, \n",
224 | " {\n",
225 | " \"$project\": {\n",
226 | " \"_id\": 0, \n",
227 | " \"summary\": 1,\n",
228 | " \"score\": {\n",
229 | " \"$meta\": \"vectorSearchScore\"\n",
230 | " }\n",
231 | " }\n",
232 | " }\n",
233 | "]\n",
234 | "\n",
235 | "# Execute the search\n",
236 | "results = collection.aggregate(pipeline)\n",
237 | "\n",
238 | "# Print results\n",
239 | "for i in results:\n",
240 | " print(i)\n"
241 | ]
242 | }
243 | ],
244 | "metadata": {
245 | "kernelspec": {
246 | "display_name": "Python 3",
247 | "language": "python",
248 | "name": "python3"
249 | },
250 | "language_info": {
251 | "codemirror_mode": {
252 | "name": "ipython",
253 | "version": 3
254 | },
255 | "file_extension": ".py",
256 | "mimetype": "text/x-python",
257 | "name": "python",
258 | "nbconvert_exporter": "python",
259 | "pygments_lexer": "ipython3",
260 | "version": "3.10.12"
261 | }
262 | },
263 | "nbformat": 4,
264 | "nbformat_minor": 2
265 | }
266 |
--------------------------------------------------------------------------------
/create-embeddings/voyage-existing-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - Voyage AI - Existing Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using the ``voyage-3-large`` model from Voyage AI.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade voyageai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import os\n",
54 | "import voyageai\n",
55 | "\n",
56 | "# Specify your Voyage API key and embedding model\n",
57 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
58 | "model = \"voyage-3-large\"\n",
59 | "vo = voyageai.Client()\n",
60 | "\n",
61 | "# Define a function to generate embeddings\n",
62 | "def get_embedding(data, input_type = \"document\"):\n",
63 | " embeddings = vo.embed(\n",
64 | " data, model = model, input_type = input_type\n",
65 | " ).embeddings\n",
66 | " return embeddings[0]\n",
67 | "\n",
68 | "# Generate an embedding\n",
69 | "embedding = get_embedding(\"foo\")\n",
70 | "print(embedding)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### (Optional) Compress your embeddings\n",
78 | "\n",
79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from bson.binary import Binary \n",
89 | "from bson.binary import BinaryVectorDtype\n",
90 | "\n",
91 | "# Define a function to generate BSON vectors\n",
92 | "def generate_bson_vector(vector, vector_dtype):\n",
93 | " return Binary.from_vector(vector, vector_dtype)\n",
94 | "\n",
95 | "# Generate BSON vector from the sample float32 embedding\n",
96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
97 | "\n",
98 | "# Print the converted embedding\n",
99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Generate Embeddings"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "import pymongo\n",
116 | "\n",
117 | "# Connect to your MongoDB cluster\n",
118 | "mongo_client = pymongo.MongoClient(\"\")\n",
119 | "db = mongo_client[\"sample_airbnb\"]\n",
120 | "collection = db[\"listingsAndReviews\"]\n",
121 | "\n",
122 | "# Define a filter to exclude documents with null or empty 'summary' fields\n",
123 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n",
124 | "\n",
125 | "# Get a subset of documents in the collection\n",
126 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from pymongo import UpdateOne\n",
136 | "\n",
137 | "# Generate the list of bulk write operations\n",
138 | "operations = []\n",
139 | "for doc in documents:\n",
140 | " summary = doc[\"summary\"]\n",
141 | " # Generate embeddings for this document\n",
142 | " embedding = get_embedding(summary)\n",
143 | "\n",
144 | " # Uncomment the following line to convert to BSON vectors\n",
145 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
146 | "\n",
147 | " # Add the update operation to the list\n",
148 | " operations.append(UpdateOne(\n",
149 | " {\"_id\": doc[\"_id\"]},\n",
150 | " {\"$set\": {\n",
151 | " \"embedding\": embedding\n",
152 | " }}\n",
153 | " ))\n",
154 | "\n",
155 | "# Execute the bulk write operation\n",
156 | "if operations:\n",
157 | " result = collection.bulk_write(operations)\n",
158 | " updated_doc_count = result.modified_count\n",
159 | "\n",
160 | "print(f\"Updated {updated_doc_count} documents.\")"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "## Index and Query Your Embeddings"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "from pymongo.operations import SearchIndexModel\n",
177 | "import time\n",
178 | "\n",
179 | "# Create your index model, then create the search index\n",
180 | "search_index_model = SearchIndexModel(\n",
181 | " definition = {\n",
182 | " \"fields\": [\n",
183 | " {\n",
184 | " \"type\": \"vector\",\n",
185 | " \"path\": \"embedding\",\n",
186 | " \"similarity\": \"dotProduct\",\n",
187 | " \"numDimensions\": 1024\n",
188 | " }\n",
189 | " ]\n",
190 | " },\n",
191 | " name=\"vector_index\",\n",
192 | " type=\"vectorSearch\"\n",
193 | ")\n",
194 | "result = collection.create_search_index(model=search_index_model)\n",
195 | "\n",
196 | "# Wait for initial sync to complete\n",
197 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
198 | "predicate=None\n",
199 | "if predicate is None:\n",
200 | " predicate = lambda index: index.get(\"queryable\") is True\n",
201 | "\n",
202 | "while True:\n",
203 | " indices = list(collection.list_search_indexes(result))\n",
204 | " if len(indices) and predicate(indices[0]):\n",
205 | " break\n",
206 | " time.sleep(5)\n",
207 | "print(result + \" is ready for querying.\")"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "# Generate embedding for the search query\n",
217 | "query_embedding = get_embedding(\"beach house\", input_type=\"query\")\n",
218 | "\n",
219 | "# Sample vector search pipeline\n",
220 | "pipeline = [\n",
221 | " {\n",
222 | " \"$vectorSearch\": {\n",
223 | " \"index\": \"vector_index\",\n",
224 | " \"queryVector\": query_embedding,\n",
225 | " \"path\": \"embedding\",\n",
226 | " \"exact\": True,\n",
227 | " \"limit\": 5\n",
228 | " }\n",
229 | " }, \n",
230 | " {\n",
231 | " \"$project\": {\n",
232 | " \"_id\": 0, \n",
233 | " \"summary\": 1,\n",
234 | " \"score\": {\n",
235 | " \"$meta\": \"vectorSearchScore\"\n",
236 | " }\n",
237 | " }\n",
238 | " }\n",
239 | "]\n",
240 | "\n",
241 | "# Execute the search\n",
242 | "results = collection.aggregate(pipeline)\n",
243 | "\n",
244 | "# Print results\n",
245 | "for i in results:\n",
246 | " print(i)\n"
247 | ]
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.10.12"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 2
271 | }
272 |
--------------------------------------------------------------------------------
/create-embeddings/openai-existing-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - OpenAI - Existing Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **existing data in MongoDB** by using OpenAI's ``text-embedding-3-small`` model.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade openai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import os\n",
54 | "from openai import OpenAI\n",
55 | "\n",
56 | "# Specify your OpenAI API key and embedding model\n",
57 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
58 | "model = \"text-embedding-3-small\"\n",
59 | "openai_client = OpenAI()\n",
60 | "\n",
61 | "# Define a function to generate embeddings\n",
62 | "def get_embedding(text):\n",
63 | " \"\"\"Generates vector embeddings for the given text.\"\"\"\n",
64 | "\n",
65 | " embedding = openai_client.embeddings.create(input = [text], model=model).data[0].embedding\n",
66 | " return embedding\n",
67 | "\n",
68 | "# Generate an embedding\n",
69 | "embedding = get_embedding(\"foo\")\n",
70 | "print(embedding)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### (Optional) Compress your embeddings\n",
78 | "\n",
79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from bson.binary import Binary \n",
89 | "from bson.binary import BinaryVectorDtype\n",
90 | "\n",
91 | "# Define a function to generate BSON vectors\n",
92 | "def generate_bson_vector(vector, vector_dtype):\n",
93 | " return Binary.from_vector(vector, vector_dtype)\n",
94 | "\n",
95 | "# Generate BSON vector from the sample float32 embedding\n",
96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
97 | "\n",
98 | "# Print the converted embedding\n",
99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Generate Embeddings"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "import pymongo\n",
116 | "\n",
117 | "# Connect to your MongoDB cluster\n",
118 | "mongo_client = pymongo.MongoClient(\"\")\n",
119 | "db = mongo_client[\"sample_airbnb\"]\n",
120 | "collection = db[\"listingsAndReviews\"]\n",
121 | "\n",
122 | "# Define a filter to exclude documents with null or empty 'summary' fields\n",
123 | "filter = { 'summary': { '$exists': True, \"$nin\": [ None, \"\" ] } }\n",
124 | "\n",
125 | "# Get a subset of documents in the collection\n",
126 | "documents = collection.find(filter, {'_id': 1, 'summary': 1}).limit(50)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from pymongo import UpdateOne\n",
136 | "\n",
137 | "# Generate the list of bulk write operations\n",
138 | "operations = []\n",
139 | "for doc in documents:\n",
140 | " summary = doc[\"summary\"]\n",
141 | " # Generate embeddings for this document\n",
142 | " embedding = get_embedding(summary)\n",
143 | "\n",
144 | " # Uncomment the following line to convert to BSON vectors\n",
145 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
146 | "\n",
147 | " # Add the update operation to the list\n",
148 | " operations.append(UpdateOne(\n",
149 | " {\"_id\": doc[\"_id\"]},\n",
150 | " {\"$set\": {\n",
151 | " \"embedding\": embedding\n",
152 | " }}\n",
153 | " ))\n",
154 | "\n",
155 | "# Execute the bulk write operation\n",
156 | "if operations:\n",
157 | " result = collection.bulk_write(operations)\n",
158 | " updated_doc_count = result.modified_count\n",
159 | "\n",
160 | "print(f\"Updated {updated_doc_count} documents.\")"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "## Index and Query Your Embeddings"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "from pymongo.operations import SearchIndexModel\n",
177 | "import time\n",
178 | "\n",
179 | "# Create your index model, then create the search index\n",
180 | "search_index_model = SearchIndexModel(\n",
181 | " definition = {\n",
182 | " \"fields\": [\n",
183 | " {\n",
184 | " \"type\": \"vector\",\n",
185 | " \"path\": \"embedding\",\n",
186 | " \"similarity\": \"dotProduct\",\n",
187 | " \"numDimensions\": 1536\n",
188 | " }\n",
189 | " ]\n",
190 | " },\n",
191 | " name=\"vector_index\",\n",
192 | " type=\"vectorSearch\"\n",
193 | ")\n",
194 | "result = collection.create_search_index(model=search_index_model)\n",
195 | "\n",
196 | "# Wait for initial sync to complete\n",
197 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
198 | "predicate=None\n",
199 | "if predicate is None:\n",
200 | " predicate = lambda index: index.get(\"queryable\") is True\n",
201 | "\n",
202 | "while True:\n",
203 | " indices = list(collection.list_search_indexes(result))\n",
204 | " if len(indices) and predicate(indices[0]):\n",
205 | " break\n",
206 | " time.sleep(5)\n",
207 | "print(result + \" is ready for querying.\")"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "# Generate embedding for the search query\n",
217 | "query_embedding = get_embedding(\"beach house\")\n",
218 | "\n",
219 | "# Sample vector search pipeline\n",
220 | "pipeline = [\n",
221 | " {\n",
222 | " \"$vectorSearch\": {\n",
223 | " \"index\": \"vector_index\",\n",
224 | " \"queryVector\": query_embedding,\n",
225 | " \"path\": \"embedding\",\n",
226 | " \"exact\": True,\n",
227 | " \"limit\": 5\n",
228 | " }\n",
229 | " }, \n",
230 | " {\n",
231 | " \"$project\": {\n",
232 | " \"_id\": 0, \n",
233 | " \"summary\": 1,\n",
234 | " \"score\": {\n",
235 | " \"$meta\": \"vectorSearchScore\"\n",
236 | " }\n",
237 | " }\n",
238 | " }\n",
239 | "]\n",
240 | "\n",
241 | "# Execute the search\n",
242 | "results = collection.aggregate(pipeline)\n",
243 | "\n",
244 | "# Print results\n",
245 | "for i in results:\n",
246 | " print(i)\n"
247 | ]
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.10.12"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 2
271 | }
272 |
--------------------------------------------------------------------------------
/ai-integrations/haystack.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Haystack Integration"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Haystack Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/haystack/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade mongodb-atlas-haystack voyage-embedders-haystack pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "\n",
42 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
43 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
44 | "os.environ[\"MONGO_CONNECTION_STRING\"]= \"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from pymongo import MongoClient\n",
54 | "client = MongoClient(os.environ.get(\"MONGO_CONNECTION_STRING\"))"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Create your database and collection\n",
64 | "db_name = \"haystack_db\"\n",
65 | "collection_name = \"test\"\n",
66 | "database = client[db_name]\n",
67 | "database.create_collection(collection_name)\n",
68 | "\n",
69 | "# Define collection\n",
70 | "collection = client[db_name][collection_name]"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "from pymongo.operations import SearchIndexModel\n",
80 | "import time\n",
81 | "\n",
82 | "# Create your index model, then create the search index\n",
83 | "search_index_model = SearchIndexModel(\n",
84 | " definition={\n",
85 | " \"fields\": [\n",
86 | " {\n",
87 | " \"type\": \"vector\",\n",
88 | " \"path\": \"embedding\",\n",
89 | " \"numDimensions\": 1024,\n",
90 | " \"similarity\": \"cosine\"\n",
91 | " }\n",
92 | " ]\n",
93 | " },\n",
94 | " name=\"vector_index\",\n",
95 | " type=\"vectorSearch\"\n",
96 | ")\n",
97 | "result = collection.create_search_index(model=search_index_model)\n",
98 | "\n",
99 | "# Wait for initial sync to complete\n",
100 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
101 | "predicate=None\n",
102 | "if predicate is None:\n",
103 | " predicate = lambda index: index.get(\"queryable\") is True\n",
104 | "\n",
105 | "while True:\n",
106 | " indices = list(collection.list_search_indexes(result))\n",
107 | " if len(indices) and predicate(indices[0]):\n",
108 | " break\n",
109 | " time.sleep(5)\n",
110 | "print(result + \" is ready for querying.\")"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore\n",
120 | "\n",
121 | "document_store = MongoDBAtlasDocumentStore(\n",
122 | " database_name=\"haystack_db\",\n",
123 | " collection_name=\"test\",\n",
124 | " vector_search_index=\"vector_index\",\n",
125 | " full_text_search_index=\"search_index\" # Declared but not used in this example\n",
126 | ")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "from haystack import Pipeline, Document\n",
136 | "from haystack.document_stores.types import DuplicatePolicy\n",
137 | "from haystack.components.writers import DocumentWriter\n",
138 | "from haystack_integrations.components.embedders.voyage_embedders import VoyageDocumentEmbedder, VoyageTextEmbedder\n",
139 | "\n",
140 | "# Create some example documents\n",
141 | "documents = [\n",
142 | " Document(content=\"My name is Jean and I live in Paris.\"),\n",
143 | " Document(content=\"My name is Mark and I live in Berlin.\"),\n",
144 | " Document(content=\"My name is Giorgio and I live in Rome.\"),\n",
145 | "]\n",
146 | "\n",
147 | "# Initializing a document embedder to convert text content into vectorized form.\n",
148 | "doc_embedder = VoyageDocumentEmbedder()\n",
149 | "\n",
150 | "# Setting up a document writer to handle the insertion of documents into the MongoDB collection.\n",
151 | "doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)\n",
152 | "\n",
153 | "# Creating a pipeline for indexing documents. The pipeline includes embedding and writing documents.\n",
154 | "indexing_pipe = Pipeline()\n",
155 | "indexing_pipe.add_component(instance=doc_embedder, name=\"doc_embedder\")\n",
156 | "indexing_pipe.add_component(instance=doc_writer, name=\"doc_writer\")\n",
157 | "\n",
158 | "# Connecting the components of the pipeline for document flow.\n",
159 | "indexing_pipe.connect(\"doc_embedder.documents\", \"doc_writer.documents\")\n",
160 | "\n",
161 | "# Running the pipeline with the list of documents to index them in MongoDB.\n",
162 | "indexing_pipe.run({\"doc_embedder\": {\"documents\": documents}})"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## Basic RAG"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "from haystack.components.generators import OpenAIGenerator\n",
179 | "from haystack.components.builders.prompt_builder import PromptBuilder\n",
180 | "from haystack_integrations.components.retrievers.mongodb_atlas import MongoDBAtlasEmbeddingRetriever\n",
181 | "\n",
182 | "# Template for generating prompts for a movie recommendation engine.\n",
183 | "prompt_template = \"\"\"\n",
184 | " You are an assistant allowed to use the following context documents.\\nDocuments:\n",
185 | " {% for doc in documents %}\n",
186 | " {{ doc.content }}\n",
187 | " {% endfor %}\n",
188 | "\n",
189 | " \\nQuery: {{query}}\n",
190 | " \\nAnswer:\n",
191 | "\"\"\"\n",
192 | "\n",
193 | "# Setting up a retrieval-augmented generation (RAG) pipeline for generating responses.\n",
194 | "rag_pipeline = Pipeline()\n",
195 | "rag_pipeline.add_component(\"text_embedder\", VoyageTextEmbedder())\n",
196 | "\n",
197 | "# Adding a component for retrieving related documents from MongoDB based on the query embedding.\n",
198 | "rag_pipeline.add_component(instance=MongoDBAtlasEmbeddingRetriever(document_store=document_store,top_k=15), name=\"retriever\")\n",
199 | "\n",
200 | "# Building prompts based on retrieved documents to be used for generating responses.\n",
201 | "rag_pipeline.add_component(\"prompt_builder\", PromptBuilder(template=prompt_template, required_variables=[\"query\", \"documents\"]))\n",
202 | "\n",
203 | "# Adding a language model generator to produce the final text output.\n",
204 | "rag_pipeline.add_component(\"llm\", OpenAIGenerator())\n",
205 | "\n",
206 | "# Connecting the components of the RAG pipeline to ensure proper data flow.\n",
207 | "rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n",
208 | "rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n",
209 | "rag_pipeline.connect(\"prompt_builder\", \"llm\")\n",
210 | "\n",
211 | "# Run the pipeline\n",
212 | "query = \"Where does Mark live?\"\n",
213 | "result = rag_pipeline.run(\n",
214 | " {\n",
215 | " \"text_embedder\": {\"text\": query},\n",
216 | " \"prompt_builder\": {\"query\": query},\n",
217 | " })\n",
218 | "print(result['llm']['replies'][0])"
219 | ]
220 | }
221 | ],
222 | "metadata": {
223 | "kernelspec": {
224 | "display_name": "Python 3",
225 | "language": "python",
226 | "name": "python3"
227 | },
228 | "language_info": {
229 | "codemirror_mode": {
230 | "name": "ipython",
231 | "version": 3
232 | },
233 | "file_extension": ".py",
234 | "mimetype": "text/x-python",
235 | "name": "python",
236 | "nbconvert_exporter": "python",
237 | "pygments_lexer": "ipython3",
238 | "version": "3.9.12"
239 | }
240 | },
241 | "nbformat": 4,
242 | "nbformat_minor": 2
243 | }
244 |
--------------------------------------------------------------------------------
/create-embeddings/open-source-new-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - Open Source - New Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **new data** by using the open-source ``nomic-embed-text-v1`` model.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade sentence-transformers pymongo einops"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from sentence_transformers import SentenceTransformer\n",
54 | "\n",
55 | "# Load the embedding model\n",
56 | "model = SentenceTransformer(\"nomic-ai/nomic-embed-text-v1\", trust_remote_code=True)\n",
57 | "\n",
58 | "# Define a function to generate embeddings\n",
59 | "def get_embedding(data, precision=\"float32\"):\n",
60 | " return model.encode(data, precision=precision).tolist()\n",
61 | "\n",
62 | "# Generate an embedding\n",
63 | "embedding = get_embedding(\"foo\")\n",
64 | "print(embedding)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "### (Optional) Compress your embeddings\n",
72 | "\n",
73 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "from bson.binary import Binary \n",
83 | "from bson.binary import BinaryVectorDtype\n",
84 | "\n",
85 | "# Define a function to generate BSON vectors\n",
86 | "def generate_bson_vector(vector, vector_dtype):\n",
87 | " return Binary.from_vector(vector, vector_dtype)\n",
88 | "\n",
89 | "# Generate BSON vector from the sample float32 embedding\n",
90 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
91 | "\n",
92 | "# Print the converted embedding\n",
93 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "## Generate Embeddings"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "# Sample data\n",
110 | "texts = [\n",
111 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n",
112 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n",
113 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n",
114 | "]"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "# Generate embeddings from the sample data\n",
124 | "embeddings = []\n",
125 | "for text in texts:\n",
126 | " embedding = get_embedding(text)\n",
127 | "\n",
128 | " # Uncomment the following line to convert to BSON vectors\n",
129 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
130 | " \n",
131 | " embeddings.append(embedding)\n",
132 | "\n",
133 | " # Print the embeddings\n",
134 | " print(f\"\\nText: {text}\")\n",
135 | " print(f\"Embedding: {embedding[:3]}... (truncated)\")"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "## Ingest Embeddings into MongoDB"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "def create_docs_with_embeddings(embeddings, data):\n",
152 | " docs = []\n",
153 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n",
154 | " doc = {\n",
155 | " \"_id\": i,\n",
156 | " \"text\": text,\n",
157 | " \"embedding\": embedding,\n",
158 | " }\n",
159 | " docs.append(doc)\n",
160 | " return docs"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "# Create documents with embeddings and sample data\n",
170 | "docs = create_docs_with_embeddings(embeddings, texts)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "import pymongo\n",
180 | "\n",
181 | "# Connect to your MongoDB cluster\n",
182 | "mongo_client = pymongo.MongoClient(\"\")\n",
183 | "db = mongo_client[\"sample_db\"]\n",
184 | "collection = db[\"embeddings\"]\n",
185 | "\n",
186 | "# Ingest data into MongoDB\n",
187 | "collection.insert_many(docs)"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {},
193 | "source": [
194 | "## Index and Query Your Embeddings"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "from pymongo.operations import SearchIndexModel\n",
204 | "import time\n",
205 | "\n",
206 | "# Create your index model, then create the search index\n",
207 | "search_index_model = SearchIndexModel(\n",
208 | " definition = {\n",
209 | " \"fields\": [\n",
210 | " {\n",
211 | " \"type\": \"vector\",\n",
212 | " \"path\": \"embedding\",\n",
213 | " \"similarity\": \"dotProduct\",\n",
214 | " \"numDimensions\": 768\n",
215 | " }\n",
216 | " ]\n",
217 | " },\n",
218 | " name=\"vector_index\",\n",
219 | " type=\"vectorSearch\"\n",
220 | ")\n",
221 | "result = collection.create_search_index(model=search_index_model)\n",
222 | "\n",
223 | "# Wait for initial sync to complete\n",
224 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
225 | "predicate=None\n",
226 | "if predicate is None:\n",
227 | " predicate = lambda index: index.get(\"queryable\") is True\n",
228 | "\n",
229 | "while True:\n",
230 | " indices = list(collection.list_search_indexes(result))\n",
231 | " if len(indices) and predicate(indices[0]):\n",
232 | " break\n",
233 | " time.sleep(5)\n",
234 | "print(result + \" is ready for querying.\")"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "# Generate embedding for the search query\n",
244 | "query_embedding = get_embedding(\"ocean tragedy\")\n",
245 | "\n",
246 | "# Sample vector search pipeline\n",
247 | "pipeline = [\n",
248 | " {\n",
249 | " \"$vectorSearch\": {\n",
250 | " \"index\": \"vector_index\",\n",
251 | " \"queryVector\": query_embedding,\n",
252 | " \"path\": \"embedding\",\n",
253 | " \"exact\": True,\n",
254 | " \"limit\": 5\n",
255 | " }\n",
256 | " }, \n",
257 | " {\n",
258 | " \"$project\": {\n",
259 | " \"_id\": 0, \n",
260 | " \"text\": 1,\n",
261 | " \"score\": {\n",
262 | " \"$meta\": \"vectorSearchScore\"\n",
263 | " }\n",
264 | " }\n",
265 | " }\n",
266 | "]\n",
267 | "\n",
268 | "# Execute the search\n",
269 | "results = collection.aggregate(pipeline)\n",
270 | "\n",
271 | "# Print results\n",
272 | "for i in results:\n",
273 | " print(i)\n"
274 | ]
275 | }
276 | ],
277 | "metadata": {
278 | "kernelspec": {
279 | "display_name": "Python 3",
280 | "language": "python",
281 | "name": "python3"
282 | },
283 | "language_info": {
284 | "codemirror_mode": {
285 | "name": "ipython",
286 | "version": 3
287 | },
288 | "file_extension": ".py",
289 | "mimetype": "text/x-python",
290 | "name": "python",
291 | "nbconvert_exporter": "python",
292 | "pygments_lexer": "ipython3",
293 | "version": "3.10.12"
294 | }
295 | },
296 | "nbformat": 4,
297 | "nbformat_minor": 2
298 | }
299 |
--------------------------------------------------------------------------------
/create-embeddings/voyage-new-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - Voyage AI - New Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **new data** by using the ``voyage-3-large`` model from Voyage AI.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade voyageai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import os\n",
54 | "import voyageai\n",
55 | "\n",
56 | "# Specify your Voyage API key and embedding model\n",
57 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
58 | "model = \"voyage-3-large\"\n",
59 | "vo = voyageai.Client()\n",
60 | "\n",
61 | "# Define a function to generate embeddings\n",
62 | "def get_embedding(data, input_type = \"document\"):\n",
63 | " embeddings = vo.embed(\n",
64 | " data, model = model, input_type = input_type\n",
65 | " ).embeddings\n",
66 | " return embeddings[0]\n",
67 | "\n",
68 | "# Generate an embedding\n",
69 | "embedding = get_embedding(\"foo\")\n",
70 | "print(embedding)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### (Optional) Compress your embeddings\n",
78 | "\n",
79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from bson.binary import Binary \n",
89 | "from bson.binary import BinaryVectorDtype\n",
90 | "\n",
91 | "# Define a function to generate BSON vectors\n",
92 | "def generate_bson_vector(vector, vector_dtype):\n",
93 | " return Binary.from_vector(vector, vector_dtype)\n",
94 | "\n",
95 | "# Generate BSON vectors using the `BinaryVectorDtype` class\n",
96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
97 | "\n",
98 | "# Print the converted embedding\n",
99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Generate Embeddings"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "# Sample data\n",
116 | "texts = [\n",
117 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n",
118 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n",
119 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n",
120 | "]"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "# Generate embeddings from the sample data\n",
130 | "embeddings = []\n",
131 | "for text in texts:\n",
132 | " embedding = get_embedding(text)\n",
133 | "\n",
134 | " # Uncomment the following line to convert to BSON vectors\n",
135 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
136 | " \n",
137 | " embeddings.append(embedding)\n",
138 | "\n",
139 | " # Print the embeddings\n",
140 | " print(f\"\\nText: {text}\")\n",
141 | " print(f\"Embedding: {embedding[:3]}... (truncated)\")"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "## Ingest Embeddings into MongoDB"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "def create_docs_with_embeddings(embeddings, data):\n",
158 | " docs = []\n",
159 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n",
160 | " doc = {\n",
161 | " \"_id\": i,\n",
162 | " \"text\": text,\n",
163 | " \"embedding\": embedding,\n",
164 | " }\n",
165 | " docs.append(doc)\n",
166 | " return docs"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# Create documents with embeddings and sample data\n",
176 | "docs = create_docs_with_embeddings(embeddings, texts)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "import pymongo\n",
186 | "\n",
187 | "# Connect to your MongoDB cluster\n",
188 | "mongo_client = pymongo.MongoClient(\"\")\n",
189 | "db = mongo_client[\"sample_db\"]\n",
190 | "collection = db[\"embeddings\"]\n",
191 | "\n",
192 | "# Ingest data into MongoDB\n",
193 | "collection.insert_many(docs)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "## Index and Query Your Embeddings"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from pymongo.operations import SearchIndexModel\n",
210 | "import time\n",
211 | "\n",
212 | "# Create your index model, then create the search index\n",
213 | "search_index_model = SearchIndexModel(\n",
214 | " definition = {\n",
215 | " \"fields\": [\n",
216 | " {\n",
217 | " \"type\": \"vector\",\n",
218 | " \"path\": \"embedding\",\n",
219 | " \"similarity\": \"dotProduct\",\n",
220 | " \"numDimensions\": 1024\n",
221 | " }\n",
222 | " ]\n",
223 | " },\n",
224 | " name=\"vector_index\",\n",
225 | " type=\"vectorSearch\"\n",
226 | ")\n",
227 | "result = collection.create_search_index(model=search_index_model)\n",
228 | "\n",
229 | "# Wait for initial sync to complete\n",
230 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
231 | "predicate=None\n",
232 | "if predicate is None:\n",
233 | " predicate = lambda index: index.get(\"queryable\") is True\n",
234 | "\n",
235 | "while True:\n",
236 | " indices = list(collection.list_search_indexes(result))\n",
237 | " if len(indices) and predicate(indices[0]):\n",
238 | " break\n",
239 | " time.sleep(5)\n",
240 | "print(result + \" is ready for querying.\")"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# Generate embedding for the search query\n",
250 | "query_embedding = get_embedding(\"ocean tragedy\", input_type=\"query\")\n",
251 | "\n",
252 | "# Sample vector search pipeline\n",
253 | "pipeline = [\n",
254 | " {\n",
255 | " \"$vectorSearch\": {\n",
256 | " \"index\": \"vector_index\",\n",
257 | " \"queryVector\": query_embedding,\n",
258 | " \"path\": \"embedding\",\n",
259 | " \"exact\": True,\n",
260 | " \"limit\": 5\n",
261 | " }\n",
262 | " }, \n",
263 | " {\n",
264 | " \"$project\": {\n",
265 | " \"_id\": 0, \n",
266 | " \"text\": 1,\n",
267 | " \"score\": {\n",
268 | " \"$meta\": \"vectorSearchScore\"\n",
269 | " }\n",
270 | " }\n",
271 | " }\n",
272 | "]\n",
273 | "\n",
274 | "# Execute the search\n",
275 | "results = collection.aggregate(pipeline)\n",
276 | "\n",
277 | "# Print results\n",
278 | "for i in results:\n",
279 | " print(i)\n"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "Python 3",
286 | "language": "python",
287 | "name": "python3"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.10.12"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 2
304 | }
305 |
--------------------------------------------------------------------------------
/create-embeddings/openai-new-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Create Embeddings - OpenAI - New Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Create Embeddings](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to generate embeddings from **new data** by using OpenAI's ``text-embedding-3-small`` model.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade openai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {
39 | "vscode": {
40 | "languageId": "shellscript"
41 | }
42 | },
43 | "source": [
44 | "## Use an Embedding Model"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import os\n",
54 | "from openai import OpenAI\n",
55 | "\n",
56 | "# Specify your OpenAI API key and embedding model\n",
57 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
58 | "model = \"text-embedding-3-small\"\n",
59 | "openai_client = OpenAI()\n",
60 | "\n",
61 | "# Define a function to generate embeddings\n",
62 | "def get_embedding(text):\n",
63 | " \"\"\"Generates vector embeddings for the given text.\"\"\"\n",
64 | "\n",
65 | " embedding = openai_client.embeddings.create(input = [text], model=model).data[0].embedding\n",
66 | " return embedding\n",
67 | "\n",
68 | "# Generate an embedding\n",
69 | "embedding = get_embedding(\"foo\")\n",
70 | "print(embedding)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### (Optional) Compress your embeddings\n",
78 | "\n",
79 | "Optionally, run the following code to define a function that converts your embeddings into BSON `binData` vectors for [efficient storage and retrieval](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-embeddings/#vector-compression)."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from bson.binary import Binary \n",
89 | "from bson.binary import BinaryVectorDtype\n",
90 | "\n",
91 | "# Define a function to generate BSON vectors\n",
92 | "def generate_bson_vector(vector, vector_dtype):\n",
93 | " return Binary.from_vector(vector, vector_dtype)\n",
94 | "\n",
95 | "# Generate BSON vectors using the `BinaryVectorDtype` class\n",
96 | "bson_float32_embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
97 | "\n",
98 | "# Print the converted embedding\n",
99 | "print(f\"The converted BSON embedding is: {bson_float32_embedding}\")"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Generate Embeddings"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "# Sample data\n",
116 | "texts = [\n",
117 | " \"Titanic: The story of the 1912 sinking of the largest luxury liner ever built\",\n",
118 | " \"The Lion King: Lion cub and future king Simba searches for his identity\",\n",
119 | " \"Avatar: A marine is dispatched to the moon Pandora on a unique mission\"\n",
120 | "]"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "# Generate embeddings from the sample data\n",
130 | "embeddings = []\n",
131 | "for text in texts:\n",
132 | " embedding = get_embedding(text)\n",
133 | "\n",
134 | " # Uncomment the following line to convert to BSON vectors\n",
135 | " # embedding = generate_bson_vector(embedding, BinaryVectorDtype.FLOAT32)\n",
136 | " \n",
137 | " embeddings.append(embedding)\n",
138 | "\n",
139 | " # Print the embeddings\n",
140 | " print(f\"\\nText: {text}\")\n",
141 | " print(f\"Embedding: {embedding[:3]}... (truncated)\")"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "## Ingest Embeddings into MongoDB"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "def create_docs_with_embeddings(embeddings, data):\n",
158 | " docs = []\n",
159 | " for i, (embedding, text) in enumerate(zip(embeddings, data)):\n",
160 | " doc = {\n",
161 | " \"_id\": i,\n",
162 | " \"text\": text,\n",
163 | " \"embedding\": embedding,\n",
164 | " }\n",
165 | " docs.append(doc)\n",
166 | " return docs"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# Create documents with embeddings and sample data\n",
176 | "docs = create_docs_with_embeddings(embeddings, texts)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "import pymongo\n",
186 | "\n",
187 | "# Connect to your MongoDB cluster\n",
188 | "mongo_client = pymongo.MongoClient(\"\")\n",
189 | "db = mongo_client[\"sample_db\"]\n",
190 | "collection = db[\"embeddings\"]\n",
191 | "\n",
192 | "# Ingest data into MongoDB\n",
193 | "collection.insert_many(docs)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "## Index and Query Your Embeddings"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from pymongo.operations import SearchIndexModel\n",
210 | "import time\n",
211 | "\n",
212 | "# Create your index model, then create the search index\n",
213 | "search_index_model = SearchIndexModel(\n",
214 | " definition = {\n",
215 | " \"fields\": [\n",
216 | " {\n",
217 | " \"type\": \"vector\",\n",
218 | " \"path\": \"embedding\",\n",
219 | " \"similarity\": \"dotProduct\",\n",
220 | " \"numDimensions\": 1536\n",
221 | " }\n",
222 | " ]\n",
223 | " },\n",
224 | " name=\"vector_index\",\n",
225 | " type=\"vectorSearch\"\n",
226 | ")\n",
227 | "result = collection.create_search_index(model=search_index_model)\n",
228 | "\n",
229 | "# Wait for initial sync to complete\n",
230 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
231 | "predicate=None\n",
232 | "if predicate is None:\n",
233 | " predicate = lambda index: index.get(\"queryable\") is True\n",
234 | "\n",
235 | "while True:\n",
236 | " indices = list(collection.list_search_indexes(result))\n",
237 | " if len(indices) and predicate(indices[0]):\n",
238 | " break\n",
239 | " time.sleep(5)\n",
240 | "print(result + \" is ready for querying.\")"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# Generate embedding for the search query\n",
250 | "query_embedding = get_embedding(\"ocean tragedy\")\n",
251 | "\n",
252 | "# Sample vector search pipeline\n",
253 | "pipeline = [\n",
254 | " {\n",
255 | " \"$vectorSearch\": {\n",
256 | " \"index\": \"vector_index\",\n",
257 | " \"queryVector\": query_embedding,\n",
258 | " \"path\": \"embedding\",\n",
259 | " \"exact\": True,\n",
260 | " \"limit\": 5\n",
261 | " }\n",
262 | " }, \n",
263 | " {\n",
264 | " \"$project\": {\n",
265 | " \"_id\": 0, \n",
266 | " \"text\": 1,\n",
267 | " \"score\": {\n",
268 | " \"$meta\": \"vectorSearchScore\"\n",
269 | " }\n",
270 | " }\n",
271 | " }\n",
272 | "]\n",
273 | "\n",
274 | "# Execute the search\n",
275 | "results = collection.aggregate(pipeline)\n",
276 | "\n",
277 | "# Print results\n",
278 | "for i in results:\n",
279 | " print(i)\n"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "Python 3",
286 | "language": "python",
287 | "name": "python3"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.10.12"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 2
304 | }
305 |
--------------------------------------------------------------------------------
/ai-integrations/langchain.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# LangChain MongoDB Integration - Implement RAG"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [LangChain Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/get-started/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pymongo pypdf"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "\n",
42 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
44 | "MONGODB_URI = \"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from langchain_community.document_loaders import PyPDFLoader\n",
54 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
55 | "\n",
56 | "# Load the PDF\n",
57 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n",
58 | "data = loader.load()\n",
59 | "\n",
60 | "# Split PDF into documents\n",
61 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n",
62 | "docs = text_splitter.split_documents(data)\n",
63 | "\n",
64 | "# Print the first document\n",
65 | "docs[0]"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n",
75 | "from langchain_voyageai import VoyageAIEmbeddings\n",
76 | "\n",
77 | "# Instantiate the vector store using your MongoDB connection string\n",
78 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n",
79 | " connection_string = MONGODB_URI,\n",
80 | " namespace = \"langchain_db.test\",\n",
81 | " embedding = VoyageAIEmbeddings(model=\"voyage-3-large\"),\n",
82 | " index_name = \"vector_index\"\n",
83 | ")\n",
84 | "\n",
85 | "# Add documents to the vector store\n",
86 | "vector_store.add_documents(documents=docs)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "# Use helper method to create the vector search index\n",
96 | "vector_store.create_vector_search_index(\n",
97 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
98 | " filters = [ \"page_label\" ],\n",
99 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
100 | ")"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "## Semantic Search Query"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "import pprint\n",
117 | "\n",
118 | "query = \"MongoDB acquisition\"\n",
119 | "results = vector_store.similarity_search(query)\n",
120 | "\n",
121 | "pprint.pprint(results)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## Semantic Search with Score"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "query = \"MongoDB acquisition\"\n",
138 | "results = vector_store.similarity_search_with_score(\n",
139 | " query = query, k = 3\n",
140 | ")\n",
141 | "\n",
142 | "pprint.pprint(results)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "## Semantic Search with Filtering"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "query = \"MongoDB acquisition\"\n",
159 | "\n",
160 | "results = vector_store.similarity_search_with_score(\n",
161 | " query = query,\n",
162 | " k = 3,\n",
163 | " pre_filter = { \"page_label\": { \"$eq\": 2 } }\n",
164 | ")\n",
165 | "\n",
166 | "pprint.pprint(results)"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Basic RAG"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "from langchain_core.output_parsers import StrOutputParser\n",
183 | "from langchain_core.runnables import RunnablePassthrough\n",
184 | "from langchain_openai import ChatOpenAI\n",
185 | "from langchain.prompts import PromptTemplate\n",
186 | "\n",
187 | "# Instantiate MongoDB Vector Search as a retriever\n",
188 | "retriever = vector_store.as_retriever(\n",
189 | " search_type = \"similarity\",\n",
190 | " search_kwargs = { \"k\": 10 }\n",
191 | ")\n",
192 | "\n",
193 | "# Define a prompt template\n",
194 | "template = \"\"\"\n",
195 | "\n",
196 | "Use the following pieces of context to answer the question at the end.\n",
197 | "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n",
198 | "\n",
199 | "{context}\n",
200 | "\n",
201 | "Question: {question}\n",
202 | "\"\"\"\n",
203 | "custom_rag_prompt = PromptTemplate.from_template(template)\n",
204 | "\n",
205 | "llm = ChatOpenAI(model=\"gpt-4o\")\n",
206 | "\n",
207 | "def format_docs(docs):\n",
208 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
209 | "\n",
210 | "# Construct a chain to answer questions on your data\n",
211 | "rag_chain = (\n",
212 | " { \"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
213 | " | custom_rag_prompt\n",
214 | " | llm\n",
215 | " | StrOutputParser()\n",
216 | ")\n",
217 | "\n",
218 | "# Prompt the chain\n",
219 | "question = \"What was MongoDB's latest acquisition?\"\n",
220 | "answer = rag_chain.invoke(question)\n",
221 | "\n",
222 | "print(\"Question: \" + question)\n",
223 | "print(\"Answer: \" + answer)\n",
224 | "\n",
225 | "# Return source documents\n",
226 | "documents = retriever.invoke(question)\n",
227 | "print(\"\\nSource documents:\")\n",
228 | "pprint.pprint(documents)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "## RAG with Filters"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "# Instantiate MongoDB Vector Search as a retriever\n",
245 | "retriever = vector_store.as_retriever(\n",
246 | " search_type = \"similarity\",\n",
247 | " search_kwargs = {\n",
248 | " \"k\": 10,\n",
249 | " \"score_threshold\": 0.75,\n",
250 | " \"pre_filter\": { \"page_label\": { \"$eq\": 2 } }\n",
251 | " }\n",
252 | ")\n",
253 | "\n",
254 | "# Define a prompt template\n",
255 | "template = \"\"\"\n",
256 | "\n",
257 | "Use the following pieces of context to answer the question at the end.\n",
258 | "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n",
259 | "\n",
260 | "{context}\n",
261 | "\n",
262 | "Question: {question}\n",
263 | "\"\"\"\n",
264 | "custom_rag_prompt = PromptTemplate.from_template(template)\n",
265 | "\n",
266 | "llm = ChatOpenAI(model=\"gpt-4o\")\n",
267 | "\n",
268 | "def format_docs(docs):\n",
269 | " return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
270 | "\n",
271 | "# Construct a chain to answer questions on your data\n",
272 | "rag_chain = (\n",
273 | " { \"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
274 | " | custom_rag_prompt\n",
275 | " | llm\n",
276 | " | StrOutputParser()\n",
277 | ")\n",
278 | "\n",
279 | "# Prompt the chain\n",
280 | "question = \"What was MongoDB's latest acquisition?\"\n",
281 | "answer = rag_chain.invoke(question)\n",
282 | "\n",
283 | "print(\"Question: \" + question)\n",
284 | "print(\"Answer: \" + answer)\n",
285 | "\n",
286 | "# Return source documents\n",
287 | "documents = retriever.invoke(question)\n",
288 | "print(\"\\nSource documents:\")\n",
289 | "pprint.pprint(documents)"
290 | ]
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": "Python 3",
296 | "language": "python",
297 | "name": "python3"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 3
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython3",
309 | "version": "3.9.12"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 2
314 | }
315 |
--------------------------------------------------------------------------------
/ai-integrations/llamaindex.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - LlamaIndex Integration"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [LlamaIndex Get Started](https://www.mongodb.com/docs/atlas/ai-integrations/llamaindex/) page. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "\n",
17 | "
\n",
18 | ""
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "vscode": {
26 | "languageId": "shellscript"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "pip install --quiet --upgrade llama-index llama-index-vector-stores-mongodb llama-index-llms-openai llama-index-embeddings-voyageai pymongo"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "\n",
42 | "os.environ[\"VOYAGEAI_API_KEY\"] = \"\"\n",
43 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
44 | "MONGODB_URI = \"\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from llama_index.embeddings.voyageai import VoyageEmbedding\n",
54 | "from llama_index.llms.openai import OpenAI\n",
55 | "from llama_index.core.settings import Settings\n",
56 | "\n",
57 | "embed_model= VoyageEmbedding(\n",
58 | " voyage_api_key = os.environ[\"VOYAGEAI_API_KEY\"],\n",
59 | " model_name = \"voyage-3-large\",\n",
60 | ")\n",
61 | "\n",
62 | "Settings.llm = OpenAI()\n",
63 | "Settings.embed_model = embed_model\n",
64 | "Settings.chunk_size = 100\n",
65 | "Settings.chunk_overlap = 10"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "from llama_index.core import SimpleDirectoryReader\n",
75 | "\n",
76 | "# Load the sample data\n",
77 | "from urllib.request import urlretrieve\n",
78 | "urlretrieve(\"https://investors.mongodb.com/node/13176/pdf\", \"mongodb-earnings-report.pdf\")\n",
79 | "sample_data = SimpleDirectoryReader(input_files=[\"mongodb-earnings-report.pdf\"]).load_data()\n",
80 | "\n",
81 | "# Print the first document\n",
82 | "sample_data[0]"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "import pymongo\n",
92 | "from llama_index.core import StorageContext\n",
93 | "from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch\n",
94 | "\n",
95 | "# Connect to your MongoDB cluster\n",
96 | "mongo_client = pymongo.MongoClient(MONGODB_URI)\n",
97 | "\n",
98 | "# Instantiate the vector store\n",
99 | "atlas_vector_store = MongoDBAtlasVectorSearch(\n",
100 | " mongo_client,\n",
101 | " db_name = \"llamaindex_db\",\n",
102 | " collection_name = \"test\",\n",
103 | " vector_index_name = \"vector_index\"\n",
104 | ")\n",
105 | "vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_store)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "from llama_index.core import VectorStoreIndex\n",
115 | "\n",
116 | "# Store the data as vector embeddings\n",
117 | "vector_store_index = VectorStoreIndex.from_documents(\n",
118 | " sample_data, storage_context=vector_store_context, show_progress=True\n",
119 | ")"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from pymongo.operations import SearchIndexModel\n",
129 | "import time\n",
130 | "\n",
131 | "# Specify the collection for which to create the index\n",
132 | "collection = mongo_client[\"llamaindex_db\"][\"test\"]\n",
133 | "\n",
134 | "# Create your index model, then create the search index\n",
135 | "search_index_model = SearchIndexModel(\n",
136 | " definition={\n",
137 | " \"fields\": [\n",
138 | " {\n",
139 | " \"type\": \"vector\",\n",
140 | " \"path\": \"embedding\",\n",
141 | " \"numDimensions\": 1024,\n",
142 | " \"similarity\": \"cosine\"\n",
143 | " },\n",
144 | " {\n",
145 | " \"type\": \"filter\",\n",
146 | " \"path\": \"metadata.page_label\"\n",
147 | " }\n",
148 | " ]\n",
149 | " },\n",
150 | " name=\"vector_index\",\n",
151 | " type=\"vectorSearch\",\n",
152 | ")\n",
153 | "result = collection.create_search_index(model=search_index_model)\n",
154 | "\n",
155 | "# Wait for initial sync to complete\n",
156 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
157 | "predicate=None\n",
158 | "if predicate is None:\n",
159 | " predicate = lambda index: index.get(\"queryable\") is True\n",
160 | "\n",
161 | "while True:\n",
162 | " indices = list(collection.list_search_indexes(result))\n",
163 | " if len(indices) and predicate(indices[0]):\n",
164 | " break\n",
165 | " time.sleep(5)\n",
166 | "print(result + \" is ready for querying.\")"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "## Semantic Search Query"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "retriever = vector_store_index.as_retriever(similarity_top_k=3)\n",
183 | "nodes = retriever.retrieve(\"MongoDB acquisition\")\n",
184 | "\n",
185 | "for node in nodes:\n",
186 | " print(node)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "## Semantic Search with Filtering"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator\n",
203 | "\n",
204 | "# Specify metadata filters\n",
205 | "metadata_filters = MetadataFilters(\n",
206 | " filters=[ExactMatchFilter(key=\"metadata.page_label\", value=\"2\")]\n",
207 | ")\n",
208 | "retriever = vector_store_index.as_retriever(similarity_top_k=3, filters=metadata_filters)\n",
209 | "nodes = retriever.retrieve(\"MongoDB acquisition\")\n",
210 | "\n",
211 | "for node in nodes:\n",
212 | " print(node)"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "## Basic RAG"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "from llama_index.core.retrievers import VectorIndexRetriever\n",
229 | "from llama_index.core.query_engine import RetrieverQueryEngine\n",
230 | "import pprint\n",
231 | "\n",
232 | "# Instantiate MongoDB Vector Search as a retriever\n",
233 | "vector_store_retriever = VectorIndexRetriever(index=vector_store_index, similarity_top_k=5)\n",
234 | "\n",
235 | "# Pass the retriever into the query engine\n",
236 | "query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)\n",
237 | "\n",
238 | "# Prompt the LLM\n",
239 | "response = query_engine.query(\"What was MongoDB's latest acquisition?\")\n",
240 | "\n",
241 | "print(response)\n",
242 | "print(\"\\nSource documents: \")\n",
243 | "pprint.pprint(response.source_nodes)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "## RAG with Filters"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "from llama_index.core.retrievers import VectorIndexRetriever\n",
260 | "from llama_index.core.query_engine import RetrieverQueryEngine\n",
261 | "import pprint\n",
262 | "\n",
263 | "# Specify metadata filters\n",
264 | "metadata_filters = MetadataFilters(\n",
265 | " filters=[ExactMatchFilter(key=\"metadata.page_label\", value=\"2\")]\n",
266 | ")\n",
267 | "\n",
268 | "# Instantiate MongoDB Vector Search as a retriever\n",
269 | "vector_store_retriever = VectorIndexRetriever(index=vector_store_index, filters=metadata_filters, similarity_top_k=5)\n",
270 | "\n",
271 | "# Pass the retriever into the query engine\n",
272 | "query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)\n",
273 | "\n",
274 | "# Prompt the LLM\n",
275 | "response = query_engine.query(\"What was MongoDB's latest acquisition?\")\n",
276 | "\n",
277 | "print(response)\n",
278 | "print(\"\\nSource documents: \")\n",
279 | "pprint.pprint(response.source_nodes)"
280 | ]
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "3.9.12",
286 | "language": "python",
287 | "name": "python3"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 3
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython3",
299 | "version": "3.9.12"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 2
304 | }
305 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-graphrag.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "b5dcbf95-9a30-416d-afed-d5b2bf0e8651",
6 | "metadata": {},
7 | "source": [
8 | "# LangChain MongoDB Integration - GraphRAG\n",
9 | "\n",
10 | "This notebook is a companion to the [GraphRAG with MongoDB and LangChain](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/graph-rag/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
11 | "\n",
12 | "This notebook demonstrates a GraphRAG implementation using MongoDB and LangChain. Compared to vector-based RAG, which structures your data as vector embeddings, GraphRAG structures data as a knowledge graph with entities and their relationships. This enables relationship-aware retrieval and multi-hop reasoning.\n",
13 | "\n",
14 | "\n",
15 | "
\n",
16 | ""
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "id": "23f70093-83ea-4ecc-87db-2f2f89e546d7",
23 | "metadata": {
24 | "scrolled": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "pip install --quiet --upgrade pymongo langchain_community wikipedia langchain_openai langchain_mongodb langchain-text-splitters pyvis"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "d96955f9-a370-4f45-970d-ef187ee6195c",
34 | "metadata": {},
35 | "source": [
36 | "## Set up your environment\n",
37 | "\n",
38 | "Before you begin, make sure you have the following:\n",
39 | "\n",
40 | "- A MongoDB cluster up and running (you'll need the [connection string](https://www.mongodb.com/docs/manual/reference/connection-string/))\n",
41 | "- An API key to access an LLM (This tutorial uses a model from OpenAI, but you can use any model [supported by LangChain](https://python.langchain.com/docs/integrations/chat/))"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "0119b58d-f14e-4f36-a284-345d94478537",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import os\n",
52 | "\n",
53 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
54 | "MONGODB_URI = \"\"\n",
55 | "DB_NAME = \"langchain_db\" # MongoDB database to store the knowledge graph\n",
56 | "COLLECTION = \"wikipedia\" # MongoDB collection to store the knowledge graph"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "id": "0adf66a8",
62 | "metadata": {},
63 | "source": [
64 | "## Use MongoDB as a knowledge graph\n",
65 | "\n",
66 | "Use the `MongoDBGraphStore` component to store your data as a knowledge graph. This component allows you to implement GraphRAG by storing entities (nodes) and their relationships (edges) in a MongoDB collection. It stores each entity as a document with relationship fields that reference other documents in your collection."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "id": "f4e8db2f-d918-41aa-92f8-41f80a6d747a",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from langchain_openai import OpenAI\n",
77 | "from langchain.chat_models import init_chat_model\n",
78 | "\n",
79 | "# For best results, use latest models such as gpt-4o and Claude Sonnet 3.5+, etc.\n",
80 | "chat_model = init_chat_model(\"gpt-4o\", model_provider=\"openai\", temperature=0)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "id": "72cd5c08-e17b-4f47-bca7-ded0fb25fb85",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "from langchain_community.document_loaders import WikipediaLoader\n",
91 | "from langchain_text_splitters import TokenTextSplitter\n",
92 | "\n",
93 | "# Load Wikipedia pages corresponding to the query \"Sherlock Holmes\"\n",
94 | "wikipedia_pages = WikipediaLoader(query=\"Sherlock Holmes\", load_max_docs=3).load()\n",
95 | "\n",
96 | "# Split the documents into chunks for efficient downstream processing (graph creation)\n",
97 | "text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=0)\n",
98 | "wikipedia_docs = text_splitter.split_documents(wikipedia_pages)\n",
99 | "\n",
100 | "# Print the first document\n",
101 | "wikipedia_docs[0]"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "id": "2dc8f05b-0f9a-4293-b9ea-761030c98dca",
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "from langchain_mongodb.graphrag.graph import MongoDBGraphStore\n",
112 | "\n",
113 | "graph_store = MongoDBGraphStore(\n",
114 | " connection_string = MONGODB_URI,\n",
115 | " database_name = DB_NAME,\n",
116 | " collection_name = COLLECTION,\n",
117 | " entity_extraction_model = chat_model\n",
118 | ")"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "id": "3664189e",
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "# Extract entities and create knowledge graph in MongoDB\n",
129 | "# This might take a few minutes; you can ignore any warnings\n",
130 | "graph_store.add_documents(wikipedia_docs)"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "id": "b167c2eb-b2c5-45ef-bdc9-8230f7da4c52",
136 | "metadata": {},
137 | "source": [
138 | "## Visualize the knowledge graph\n",
139 | "\n",
140 | "To visualize the knowledge graph, you can export the structured data to a visualization library like `pyvis`.\n",
141 | "This helps you to explore and understand the relationships and hierarchies within your data."
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "8b515723-a8a4-435b-b386-5cb3244c2745",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "import networkx as nx\n",
152 | "from pyvis.network import Network\n",
153 | "\n",
154 | "def visualize_graph(collection):\n",
155 | " docs = list(collection.find())\n",
156 | " \n",
157 | " def format_attributes(attrs):\n",
158 | " return \"
\".join(f\"{k}: {', '.join(v)}\" for k, v in attrs.items()) if attrs else \"\"\n",
159 | " \n",
160 | " G = nx.DiGraph()\n",
161 | "\n",
162 | " # Create nodes\n",
163 | " for doc in docs:\n",
164 | " node_id = str(doc[\"_id\"])\n",
165 | " info = f\"Type: {doc.get('type', '')}\"\n",
166 | " if \"attributes\" in doc:\n",
167 | " attr_info = format_attributes(doc[\"attributes\"])\n",
168 | " if attr_info:\n",
169 | " info += \"
\" + attr_info\n",
170 | " G.add_node(node_id, label=node_id, title=info.replace(\"
\", \"\\n\"))\n",
171 | "\n",
172 | " # Create edges\n",
173 | " for doc in docs:\n",
174 | " source = str(doc[\"_id\"])\n",
175 | " rels = doc.get(\"relationships\", {})\n",
176 | " targets = rels.get(\"target_ids\", [])\n",
177 | " types = rels.get(\"types\", [])\n",
178 | " attrs = rels.get(\"attributes\", [])\n",
179 | " \n",
180 | " for i, target in enumerate(targets):\n",
181 | " edge_type = types[i] if i < len(types) else \"\"\n",
182 | " extra = attrs[i] if i < len(attrs) else {}\n",
183 | " edge_info = f\"Relationship: {edge_type}\"\n",
184 | " if extra:\n",
185 | " edge_info += \"
\" + format_attributes(extra)\n",
186 | " G.add_edge(source, str(target), label=edge_type, title=edge_info.replace(\"
\", \"\\n\"))\n",
187 | "\n",
188 | " # Build and configure network\n",
189 | " nt = Network(notebook=True, cdn_resources='in_line', width=\"800px\", height=\"600px\", directed=True)\n",
190 | " nt.from_nx(G)\n",
191 | " nt.set_options('''\n",
192 | " var options = {\n",
193 | " \"interaction\": {\n",
194 | " \"hover\": true,\n",
195 | " \"tooltipDelay\": 200\n",
196 | " },\n",
197 | " \"nodes\": {\n",
198 | " \"font\": {\"multi\": \"html\"}\n",
199 | " },\n",
200 | " \"physics\": {\n",
201 | " \"repulsion\": {\n",
202 | " \"nodeDistance\": 300,\n",
203 | " \"centralGravity\": 0.2,\n",
204 | " \"springLength\": 200,\n",
205 | " \"springStrength\": 0.05,\n",
206 | " \"damping\": 0.09\n",
207 | " }\n",
208 | " }\n",
209 | " }\n",
210 | " ''')\n",
211 | "\n",
212 | " return nt.generate_html()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "62f9040e",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "from IPython.display import HTML, display\n",
223 | "from pymongo import MongoClient\n",
224 | "\n",
225 | "client = MongoClient(MONGODB_URI)\n",
226 | "\n",
227 | "collection = client[DB_NAME][COLLECTION]\n",
228 | "html = visualize_graph(collection)\n",
229 | "\n",
230 | "display(HTML(html))"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "id": "fbea568d-c656-4271-9e40-6ee01292255e",
236 | "metadata": {},
237 | "source": [
238 | "## Answer questions on your data\n",
239 | "\n",
240 | "The `MongoDBGraphStore` class provides a `chat_response` method that you can use to answer questions on your data. It executes queries by using the `$graphLookup` aggregation stage."
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "id": "506c7366-972c-4e50-88c4-3d5b0151e363",
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "query = \"Who inspired Sherlock Holmes?\"\n",
251 | "\n",
252 | "answer = graph_store.chat_response(query)\n",
253 | "answer.content"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3 (ipykernel)",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.9.6"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 5
278 | }
279 |
--------------------------------------------------------------------------------
/quantization/existing-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Vector Quantization - Existing Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **existing data in Atlas** by using the `voyage-3-large` model from [Voyage AI](https://www.voyageai.com).\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade voyageai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "import voyageai\n",
44 | "from bson.binary import Binary, BinaryVectorDtype\n",
45 | "\n",
46 | "# Initialize the VoyageAI Client\n",
47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
48 | "vo = voyageai.Client()\n",
49 | "\n",
50 | "# Define a function to generate embeddings for all strings in `texts`\n",
51 | "def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n",
52 | " embeddings = []\n",
53 | " for text in texts: # Process eachstring in the data list\n",
54 | " embedding = vo.embed(\n",
55 | " texts=[text], # Pass each string as a list with a single item\n",
56 | " model=model,\n",
57 | " output_dtype=dtype,\n",
58 | " output_dimension=output_dimension,\n",
59 | " ).embeddings[0]\n",
60 | " embeddings.append(embedding) # Collect the embedding for the current text\n",
61 | " return embeddings\n",
62 | "\n",
63 | "# Convert embeddings to BSON vectors\n",
64 | "def generate_bson_vector(vector, vector_dtype):\n",
65 | " return Binary.from_vector(vector, vector_dtype)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "import pymongo \n",
75 | "\n",
76 | "# Connect to your MongoDB cluster\n",
77 | "mongo_client = pymongo.MongoClient(\"\")\n",
78 | "db = mongo_client[\"sample_airbnb\"]\n",
79 | "collection = db[\"listingsAndReviews\"]\n",
80 | "\n",
81 | "# Filter to exclude null or empty summary fields\n",
82 | "filter = { \"summary\": {\"$nin\": [None, \"\"]} }\n",
83 | "\n",
84 | "# Get a subset of documents in the collection\n",
85 | "documents = collection.find(filter).limit(50)\n",
86 | "\n",
87 | "# Initialize the count of updated documents\n",
88 | "updated_doc_count = 0"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "model_name = \"voyage-3-large\"\n",
98 | "output_dimension = 1024\n",
99 | "float32_field = \"float32_embedding\"\n",
100 | "int8_field = \"int8_embedding\"\n",
101 | "int1_field = \"int1_embedding\"\n",
102 | "\n",
103 | "# Process and update each document\n",
104 | "updated_doc_count = 0 \n",
105 | "for document in documents: \n",
106 | " summary = document.get(\"summary\") \n",
107 | " if not summary: \n",
108 | " continue \n",
109 | " \n",
110 | " # Generate embeddings for the summary field \n",
111 | " float_embeddings = generate_embeddings([summary], model=model_name, dtype=\"float\", output_dimension=output_dimension) \n",
112 | " int8_embeddings = generate_embeddings([summary], model=model_name, dtype=\"int8\", output_dimension=output_dimension) \n",
113 | " ubinary_embeddings = generate_embeddings([summary], model=model_name, dtype=\"ubinary\", output_dimension=output_dimension) \n",
114 | " \n",
115 | " # Convert embeddings to BSON-compatible format \n",
116 | " bson_float = generate_bson_vector(float_embeddings[0], BinaryVectorDtype.FLOAT32) \n",
117 | " bson_int8 = generate_bson_vector(int8_embeddings[0], BinaryVectorDtype.INT8) \n",
118 | " bson_ubinary = generate_bson_vector(ubinary_embeddings[0], BinaryVectorDtype.PACKED_BIT) \n",
119 | " \n",
120 | " # Prepare the updated document \n",
121 | " updated_fields = { \n",
122 | " float32_field: bson_float, \n",
123 | " int8_field: bson_int8, \n",
124 | " int1_field: bson_ubinary,\n",
125 | " } \n",
126 | " \n",
127 | " # Update the document in MongoDB \n",
128 | " result = collection.update_one({\"_id\": document[\"_id\"]}, {\"$set\": updated_fields}) \n",
129 | " if result.modified_count > 0: \n",
130 | " updated_doc_count += 1 \n",
131 | " \n",
132 | "# Print the results \n",
133 | "print(f\"Number of documents updated: {updated_doc_count}\") "
134 | ]
135 | },
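{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional verification step (not part of the original tutorial): the sketch below assumes the update loop above has run, and fetches one updated document to confirm that the three embedding fields were written."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: confirm that an updated document now contains the three embedding fields\n",
"doc = collection.find_one({ int1_field: { \"$exists\": True } }, { \"summary\": 1, float32_field: 1, int8_field: 1, int1_field: 1 })\n",
"print(list(doc.keys()) if doc else \"No updated documents found\")"
]
},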
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "from pymongo.operations import SearchIndexModel\n",
143 | "import time\n",
144 | "\n",
145 | "# Define and create the vector search index\n",
146 | "index_name = \"vector_index\"\n",
147 | "search_index_model = SearchIndexModel(\n",
148 | " definition={\n",
149 | " \"fields\": [\n",
150 | " {\n",
151 | " \"type\": \"vector\",\n",
152 | " \"path\": float32_field,\n",
153 | " \"similarity\": \"dotProduct\",\n",
154 | " \"numDimensions\": 1024\n",
155 | " },\n",
156 | " {\n",
157 | " \"type\": \"vector\",\n",
158 | " \"path\": int8_field,\n",
159 | " \"similarity\": \"dotProduct\",\n",
160 | " \"numDimensions\": 1024\n",
161 | " },\n",
162 | " {\n",
163 | " \"type\": \"vector\",\n",
164 | " \"path\": int1_field,\n",
165 | " \"similarity\": \"euclidean\",\n",
166 | " \"numDimensions\": 1024\n",
167 | " }\n",
168 | " ]\n",
169 | " },\n",
170 | " name=index_name,\n",
171 | " type=\"vectorSearch\"\n",
172 | ")\n",
173 | "result = collection.create_search_index(model=search_index_model)\n",
174 | "print(\"New search index named \" + result + \" is building.\")\n",
175 | "\n",
176 | "# Wait for initial sync to complete\n",
177 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
178 | "predicate=None\n",
179 | "if predicate is None:\n",
180 | " predicate = lambda index: index.get(\"queryable\") is True\n",
181 | "while True:\n",
182 | " indices = list(collection.list_search_indexes(index_name))\n",
183 | " if len(indices) and predicate(indices[0]):\n",
184 | " break\n",
185 | " time.sleep(5)\n",
186 | "print(result + \" is ready for querying.\")"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "import voyageai\n",
196 | "from bson.binary import Binary, BinaryVectorDtype\n",
197 | "\n",
198 | "# Define a function to run a vector search query\n",
199 | "def run_vector_search(query_text, collection, path):\n",
200 | " # Map path to output dtype and BSON vector type\n",
201 | " path_to_dtype = {\n",
202 | " float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n",
203 | " int8_field: (\"int8\", BinaryVectorDtype.INT8),\n",
204 | " int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n",
205 | " }\n",
206 | "\n",
207 | " if path not in path_to_dtype:\n",
208 | " raise ValueError(\"Invalid path. Must be one of float32_field, int8_field, int1_field.\")\n",
209 | "\n",
210 | " # Get Voyage AI output dtype and BSON vector type based on the path\n",
211 | " output_dtype, bson_dtype = path_to_dtype[path]\n",
212 | "\n",
213 | " # Generate query embeddings using Voyage AI\n",
214 | " query_vector = vo.embed(\n",
215 | " texts=[query_text],\n",
216 | " model=\"voyage-3-large\",\n",
217 | " input_type=\"query\",\n",
218 | " output_dtype=output_dtype\n",
219 | " ).embeddings[0]\n",
220 | "\n",
221 | " # Convert the query vector to BSON format\n",
222 | " bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n",
223 | "\n",
224 | " # Define the aggregation pipeline for vector search\n",
225 | " pipeline = [\n",
226 | " {\n",
227 | " \"$vectorSearch\": {\n",
228 | " \"index\": index_name, # Replace with your index name\n",
229 | " \"path\": path, # Path to the embedding field\n",
230 | " \"queryVector\": bson_query_vector, # BSON-encoded query vector\n",
231 | " \"numCandidates\": 20,\n",
232 | " \"limit\": 5\n",
233 | " }\n",
234 | " },\n",
235 | " {\n",
236 | " \"$project\": {\n",
237 | " \"_id\": 0,\n",
238 | " \"summary\": 1,\n",
239 | " \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n",
240 | " }\n",
241 | " }\n",
242 | " ]\n",
243 | "\n",
244 | " # Run the aggregation pipeline and return results\n",
245 | " return collection.aggregate(pipeline)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "from pprint import pprint\n",
255 | "\n",
256 | "# Define a list of embedding fields to query\n",
257 | "embedding_fields = [float32_field, int8_field, int1_field] \n",
258 | "results = {}\n",
259 | "\n",
260 | "# Run vector search queries for each embedding type\n",
261 | "query_text = \"ocean view\"\n",
262 | "for field in embedding_fields:\n",
263 | " results[field] = list(run_vector_search(query_text, collection, field)) \n",
264 | "\n",
265 | "# Print the results\n",
266 | "for field, field_results in results.items():\n",
267 | " print(f\"Results from {field}\")\n",
268 | " pprint(field_results)"
269 | ]
270 | }
271 | ],
272 | "metadata": {
273 | "kernelspec": {
274 | "display_name": "Python 3",
275 | "language": "python",
276 | "name": "python3"
277 | },
278 | "language_info": {
279 | "codemirror_mode": {
280 | "name": "ipython",
281 | "version": 3
282 | },
283 | "file_extension": ".py",
284 | "mimetype": "text/x-python",
285 | "name": "python",
286 | "nbconvert_exporter": "python",
287 | "pygments_lexer": "ipython3",
288 | "version": "3.9.12"
289 | }
290 | },
291 | "nbformat": 4,
292 | "nbformat_minor": 2
293 | }
294 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/ai-integrations/langchain-memory-semantic-cache.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "922b6c28",
6 | "metadata": {},
7 | "source": [
8 | "# LangChain MongoDB Integration - Memory and Semantic Caching for RAG"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "1fc29d11",
14 | "metadata": {},
15 | "source": [
16 | "This notebook is a companion to the [Memory and Semantic Caching](https://www.mongodb.com/docs/atlas/ai-integrations/langchain/memory-semantic-cache/) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "a289ba35",
27 | "metadata": {
28 | "vscode": {
29 | "languageId": "shellscript"
30 | }
31 | },
32 | "outputs": [],
33 | "source": [
34 | "pip install --quiet --upgrade langchain langchain-community langchain-core langchain-mongodb langchain-voyageai langchain-openai langchain-text-splitters pypdf"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "c672ba1f",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import os\n",
45 | "\n",
46 | "os.environ[\"OPENAI_API_KEY\"] = \"\"\n",
47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
48 | "MONGODB_URI = \"\""
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "id": "8384c99d",
54 | "metadata": {},
55 | "source": [
56 | "## Configure the Vector Store"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "id": "f90ce770",
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "from langchain_mongodb import MongoDBAtlasVectorSearch\n",
67 | "from langchain_voyageai import VoyageAIEmbeddings\n",
68 | "\n",
69 | "# Use the voyage-3-large embedding model\n",
70 | "embedding_model = VoyageAIEmbeddings(model=\"voyage-3-large\")\n",
71 | "\n",
72 | "# Create the vector store\n",
73 | "vector_store = MongoDBAtlasVectorSearch.from_connection_string(\n",
74 | " connection_string = MONGODB_URI,\n",
75 | " embedding = embedding_model,\n",
76 | " namespace = \"langchain_db.rag_with_memory\"\n",
77 | ")"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "7fb2f164",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from langchain_community.document_loaders import PyPDFLoader\n",
88 | "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
89 | "\n",
90 | "# Load the PDF\n",
91 | "loader = PyPDFLoader(\"https://investors.mongodb.com/node/13176/pdf\")\n",
92 | "data = loader.load()\n",
93 | "\n",
94 | "# Split PDF into documents\n",
95 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n",
96 | "docs = text_splitter.split_documents(data)\n",
97 | "\n",
98 | "# Add data to the vector store\n",
99 | "vector_store.add_documents(docs)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "8bf1bff8",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "# Use helper method to create the vector search index\n",
110 | "vector_store.create_vector_search_index(\n",
111 | " dimensions = 1024, # The dimensions of the vector embeddings to be indexed\n",
112 | " wait_until_complete = 60 # Number of seconds to wait for the index to build (can take around a minute)\n",
113 | ")"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "id": "8c3b6654",
119 | "metadata": {},
120 | "source": [
121 | "## Implement RAG with Memory"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "55583167",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "from langchain_openai import ChatOpenAI\n",
132 | "\n",
133 | "# Define the model to use for chat completion\n",
134 | "llm = ChatOpenAI(model = \"gpt-4o\")"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "3b3b0361",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "from langchain_mongodb.chat_message_histories import MongoDBChatMessageHistory\n",
145 | "from langchain_core.runnables.history import RunnableWithMessageHistory\n",
146 | "from langchain_core.prompts import MessagesPlaceholder\n",
147 | " \n",
148 | "# Define a function that gets the chat message history \n",
149 | "def get_session_history(session_id: str) -> MongoDBChatMessageHistory:\n",
150 | " return MongoDBChatMessageHistory(\n",
151 | " connection_string=MONGODB_URI,\n",
152 | " session_id=session_id,\n",
153 | " database_name=\"langchain_db\",\n",
154 | " collection_name=\"rag_with_memory\"\n",
155 | " )"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "id": "74dfa896",
162 | "metadata": {},
163 | "outputs": [],
164 | "source": [
165 | "from langchain_core.prompts import ChatPromptTemplate\n",
166 | "from langchain_core.output_parsers import StrOutputParser\n",
167 | "\n",
168 | "# Create a prompt to generate standalone questions from follow-up questions\n",
169 | "standalone_system_prompt = \"\"\"\n",
170 | " Given a chat history and a follow-up question, rephrase the follow-up question to be a standalone question.\n",
171 | " Do NOT answer the question, just reformulate it if needed, otherwise return it as is.\n",
172 | " Only return the final standalone question.\n",
173 | "\"\"\"\n",
174 | "\n",
175 | "standalone_question_prompt = ChatPromptTemplate.from_messages(\n",
176 | " [\n",
177 | " (\"system\", standalone_system_prompt),\n",
178 | " MessagesPlaceholder(variable_name=\"history\"),\n",
179 | " (\"human\", \"{question}\"),\n",
180 | " ]\n",
181 | ")\n",
182 | "# Parse output as a string\n",
183 | "parse_output = StrOutputParser()\n",
184 | "\n",
185 | "question_chain = standalone_question_prompt | llm | parse_output"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "c7ad7c83",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "from langchain_core.runnables import RunnablePassthrough\n",
196 | "\n",
197 | "# Create a retriever\n",
198 | "retriever = vector_store.as_retriever(search_type=\"similarity\", search_kwargs={ \"k\": 5 })\n",
199 | "\n",
200 | "# Create a retriever chain that processes the question with history and retrieves documents\n",
201 | "retriever_chain = RunnablePassthrough.assign(context=question_chain | retriever | (lambda docs: \"\\n\\n\".join([d.page_content for d in docs])))"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "id": "c15d460d",
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "# Create a prompt template that includes the retrieved context and chat history\n",
212 | "rag_system_prompt = \"\"\"Answer the question based only on the following context:\n",
213 | "{context}\n",
214 | "\"\"\"\n",
215 | "\n",
216 | "rag_prompt = ChatPromptTemplate.from_messages(\n",
217 | " [\n",
218 | " (\"system\", rag_system_prompt),\n",
219 | " MessagesPlaceholder(variable_name=\"history\"),\n",
220 | " (\"human\", \"{question}\"),\n",
221 | " ]\n",
222 | ")"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "id": "4401715b",
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "# Build the RAG chain\n",
233 | "rag_chain = (\n",
234 | " retriever_chain\n",
235 | " | rag_prompt\n",
236 | " | llm\n",
237 | " | parse_output\n",
238 | ")\n",
239 | "\n",
240 | "# Wrap the chain with message history\n",
241 | "rag_with_memory = RunnableWithMessageHistory(\n",
242 | " rag_chain,\n",
243 | " get_session_history,\n",
244 | " input_messages_key=\"question\",\n",
245 | " history_messages_key=\"history\",\n",
246 | ")"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "id": "2093d8c8",
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "# First question\n",
257 | "response_1 = rag_with_memory.invoke(\n",
258 | " {\"question\": \"What was MongoDB's latest acquisition?\"},\n",
259 | " {\"configurable\": {\"session_id\": \"user_1\"}}\n",
260 | ")\n",
261 | "print(response_1)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "id": "14513bb6",
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "# Follow-up question that references the previous question\n",
272 | "response_2 = rag_with_memory.invoke(\n",
273 | " {\"question\": \"Why did they do it?\"},\n",
274 | " {\"configurable\": {\"session_id\": \"user_1\"}}\n",
275 | ")\n",
276 | "print(response_2)"
277 | ]
278 | },
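{
"cell_type": "markdown",
"id": "c0ffee01",
"metadata": {},
"source": [
"Optional check (not part of the original tutorial): because `get_session_history` persists messages in MongoDB, you can inspect the stored conversation for a session. This sketch assumes the `user_1` session used in the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0ffee02",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: print the chat messages persisted for the \"user_1\" session\n",
"for message in get_session_history(\"user_1\").messages:\n",
"    print(f\"{message.type}: {message.content[:80]}\")"
]
},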
279 | {
280 | "cell_type": "markdown",
281 | "id": "d9b2c3c5",
282 | "metadata": {},
283 | "source": [
284 | "## Add Semantic Caching\n",
285 | "\n",
286 | "The semantic cache caches only the input to the LLM. When using it in retrieval chains, \n",
287 | "note that documents retrieved can change between runs, resulting in cache misses for \n",
288 | "semantically similar queries."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "594315fe",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "from langchain_mongodb.cache import MongoDBAtlasSemanticCache\n",
299 | "from langchain_core.globals import set_llm_cache\n",
300 | "\n",
301 | "# Configure the semantic cache\n",
302 | "set_llm_cache(MongoDBAtlasSemanticCache(\n",
303 | " connection_string = MONGODB_URI,\n",
304 | " database_name = \"langchain_db\",\n",
305 | " collection_name = \"semantic_cache\",\n",
306 | " embedding = embedding_model,\n",
307 | " index_name = \"vector_index\",\n",
308 | " similarity_threshold = 0.5 # Adjust based on your requirements\n",
309 | "))"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "id": "f8063217",
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "%%time\n",
320 | "\n",
321 | "# First query (not cached)\n",
322 | "rag_with_memory.invoke(\n",
323 | " {\"question\": \"What was MongoDB's latest acquisition?\"},\n",
324 | " {\"configurable\": {\"session_id\": \"user_2\"}}\n",
325 | ")"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "id": "df4b0318",
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "%%time\n",
336 | "\n",
337 | "# Second query (cached)\n",
338 | "rag_with_memory.invoke(\n",
339 | " {\"question\": \"What company did MongoDB acquire recently?\"},\n",
340 | " {\"configurable\": {\"session_id\": \"user_2\"}}\n",
341 | ")"
342 | ]
343 | }
344 | ],
345 | "metadata": {
346 | "kernelspec": {
347 | "display_name": "Python 3",
348 | "language": "python",
349 | "name": "python3"
350 | },
351 | "language_info": {
352 | "codemirror_mode": {
353 | "name": "ipython",
354 | "version": 3
355 | },
356 | "file_extension": ".py",
357 | "mimetype": "text/x-python",
358 | "name": "python",
359 | "nbconvert_exporter": "python",
360 | "pygments_lexer": "ipython3",
361 | "version": "3.10.12"
362 | }
363 | },
364 | "nbformat": 4,
365 | "nbformat_minor": 5
366 | }
367 |
--------------------------------------------------------------------------------
/quantization/new-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MongoDB Vector Search - Vector Quantization - New Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook is a companion to the [Vector Quantization](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/#how-to-ingest-pre-quantized-vectors) tutorial. Refer to the page for set-up instructions and detailed explanations.\n",
15 | "\n",
16 | "This notebook takes you through how to pre-quantize and ingest your vectors for vector search from **existing data in Atlas** by using the `voyage-3-large` model from [Voyage AI](https://www.voyageai.com).\n",
17 | "\n",
18 | "\n",
19 | "
\n",
20 | ""
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "vscode": {
28 | "languageId": "shellscript"
29 | }
30 | },
31 | "outputs": [],
32 | "source": [
33 | "pip install --quiet --upgrade voyageai pymongo"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "import voyageai\n",
44 | "from bson.binary import Binary, BinaryVectorDtype\n",
45 | "\n",
46 | "# Initialize the VoyageAI Client\n",
47 | "os.environ[\"VOYAGE_API_KEY\"] = \"\"\n",
48 | "vo = voyageai.Client()\n",
49 | "\n",
50 | "# Define a function to generate embeddings for all strings in `texts`\n",
51 | "def generate_embeddings(texts, model: str, dtype: str, output_dimension: int):\n",
52 | " embeddings = []\n",
53 | " for text in texts: # Process eachstring in the data list\n",
54 | " embedding = vo.embed(\n",
55 | " texts=[text], # Pass each string as a list with a single item\n",
56 | " model=model,\n",
57 | " output_dtype=dtype,\n",
58 | " output_dimension=output_dimension,\n",
59 | " ).embeddings[0]\n",
60 | " embeddings.append(embedding) # Collect the embedding for the current text\n",
61 | " return embeddings\n",
62 | "\n",
63 | "# Convert embeddings to BSON vectors\n",
64 | "def generate_bson_vector(vector, vector_dtype):\n",
65 | " return Binary.from_vector(vector, vector_dtype)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "# Load sample data\n",
75 | "data = [\n",
76 | " \"The Great Wall of China is visible from space.\",\n",
77 | " \"The Eiffel Tower was completed in Paris in 1889.\",\n",
78 | " \"Mount Everest is the highest peak on Earth at 8,848m.\",\n",
79 | " \"Shakespeare wrote 37 plays and 154 sonnets during his lifetime.\",\n",
80 | " \"The Mona Lisa was painted by Leonardo da Vinci.\"\n",
81 | "]"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# Use the function with different output data types to generate embeddings\n",
91 | "model_name = \"voyage-3-large\"\n",
92 | "output_dimension = 1024\n",
93 | "\n",
94 | "# Generate embeddings in all supported data types\n",
95 | "float32_embeddings = generate_embeddings(data, model=model_name, dtype=\"float\", output_dimension=output_dimension)\n",
96 | "int8_embeddings = generate_embeddings(data, model=model_name, dtype=\"int8\", output_dimension=output_dimension)\n",
97 | "int1_embeddings = generate_embeddings(data, model=model_name, dtype=\"ubinary\", output_dimension=output_dimension)"
98 | ]
99 | },
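{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional inspection step (not part of the original tutorial): compare the lengths of the returned embeddings. The `ubinary` output is bit-packed, so it is typically returned as a much shorter list of integers than the 1024-dimensional `float` and `int8` outputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: compare embedding lengths for each output dtype\n",
"print(\"float:\", len(float32_embeddings[0]))\n",
"print(\"int8:\", len(int8_embeddings[0]))\n",
"print(\"ubinary (bit-packed):\", len(int1_embeddings[0]))"
]
},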
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# For all vectors in your collection, generate BSON vectors of float32, int8, and int1 embeddings\n",
107 | "bson_float32_embeddings = []\n",
108 | "bson_int8_embeddings = []\n",
109 | "bson_int1_embeddings = []\n",
110 | "for i, (f32_emb, int8_emb, int1_emb) in enumerate(zip(float32_embeddings, int8_embeddings, int1_embeddings)):\n",
111 | " bson_float32_embeddings.append(generate_bson_vector(f32_emb, BinaryVectorDtype.FLOAT32))\n",
112 | " bson_int8_embeddings.append(generate_bson_vector(int8_emb, BinaryVectorDtype.INT8))\n",
113 | " bson_int1_embeddings.append(generate_bson_vector(int1_emb, BinaryVectorDtype.PACKED_BIT))"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "# Specify the field names for the float32, int8, and int1 embeddings\n",
123 | "float32_field = \"float32_embedding\" \n",
124 | "int8_field = \"int8_embedding\"\n",
125 | "int1_field = \"int1_embedding\"\n",
126 | "\n",
127 | "# Define function to create documents with BSON vector embeddings\n",
128 | "def create_new_docs_with_bson_vectors(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data):\n",
129 | " docs = []\n",
130 | " for i, (bson_f32_emb, bson_int8_emb, bson_int1_emb, text) in enumerate(zip(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data)):\n",
131 | "\n",
132 | " doc = {\n",
133 | " \"_id\": i,\n",
134 | " \"text\": text,\n",
135 | " float32_field: bson_f32_emb,\n",
136 | " int8_field: bson_int8_emb,\n",
137 | " int1_field: bson_int1_emb\n",
138 | " }\n",
139 | " docs.append(doc)\n",
140 | " return docs\n",
141 | "\n",
142 | "# Create the documents\n",
143 | "documents = create_new_docs_with_bson_vectors(bson_float32_embeddings, bson_int8_embeddings, bson_int1_embeddings, data)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "import pymongo\n",
153 | "\n",
154 | "mongo_client = pymongo.MongoClient(\"\")\n",
155 | "# Insert documents into a new database and collection\n",
156 | "db = mongo_client[\"\"]\n",
157 | "collection_name = \"\"\n",
158 | "db.create_collection(collection_name)\n",
159 | "collection = db[collection_name]\n",
160 | "\n",
161 | "collection.insert_many(documents)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "from pymongo.operations import SearchIndexModel\n",
171 | "import time\n",
172 | "\n",
173 | "# Define and create the vector search index\n",
174 | "index_name = \"vector_index\"\n",
175 | "search_index_model = SearchIndexModel(\n",
176 | " definition={\n",
177 | " \"fields\": [\n",
178 | " {\n",
179 | " \"type\": \"vector\",\n",
180 | " \"path\": float32_field,\n",
181 | " \"similarity\": \"dotProduct\",\n",
182 | " \"numDimensions\": 1024\n",
183 | " },\n",
184 | " {\n",
185 | " \"type\": \"vector\",\n",
186 | " \"path\": int8_field,\n",
187 | " \"similarity\": \"dotProduct\",\n",
188 | " \"numDimensions\": 1024\n",
189 | " },\n",
190 | " {\n",
191 | " \"type\": \"vector\",\n",
192 | " \"path\": int1_field,\n",
193 | " \"similarity\": \"euclidean\",\n",
194 | " \"numDimensions\": 1024\n",
195 | " }\n",
196 | " ]\n",
197 | " },\n",
198 | " name=index_name,\n",
199 | " type=\"vectorSearch\"\n",
200 | ")\n",
201 | "result = collection.create_search_index(model=search_index_model)\n",
202 | "print(\"New search index named \" + result + \" is building.\")\n",
203 | "\n",
204 | "# Wait for initial sync to complete\n",
205 | "print(\"Polling to check if the index is ready. This may take up to a minute.\")\n",
206 | "predicate=None\n",
207 | "if predicate is None:\n",
208 | " predicate = lambda index: index.get(\"queryable\") is True\n",
209 | "while True:\n",
210 | " indices = list(collection.list_search_indexes(index_name))\n",
211 | " if len(indices) and predicate(indices[0]):\n",
212 | " break\n",
213 | " time.sleep(5)\n",
214 | "print(result + \" is ready for querying.\")"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "import voyageai\n",
224 | "from bson.binary import Binary, BinaryVectorDtype\n",
225 | "\n",
226 | "# Define a function to run a vector search query\n",
227 | "def run_vector_search(query_text, collection, path):\n",
228 | " # Map path to output dtype and BSON vector type\n",
229 | " path_to_dtype = {\n",
230 | " float32_field: (\"float\", BinaryVectorDtype.FLOAT32),\n",
231 | " int8_field: (\"int8\", BinaryVectorDtype.INT8),\n",
232 | " int1_field: (\"ubinary\", BinaryVectorDtype.PACKED_BIT),\n",
233 | " }\n",
234 | "\n",
235 | " if path not in path_to_dtype:\n",
236 | " raise ValueError(\"Invalid path. Must be one of float32_field, int8_field, int1_field.\")\n",
237 | "\n",
238 | " # Get Voyage AI output dtype and BSON vector type based on the path\n",
239 | " output_dtype, bson_dtype = path_to_dtype[path]\n",
240 | "\n",
241 | " # Generate query embeddings using Voyage AI\n",
242 | " query_vector = vo.embed(\n",
243 | " texts=[query_text],\n",
244 | " model=\"voyage-3-large\",\n",
245 | " input_type=\"query\",\n",
246 | " output_dtype=output_dtype\n",
247 | " ).embeddings[0]\n",
248 | "\n",
249 | " # Convert the query vector to BSON format\n",
250 | " bson_query_vector = Binary.from_vector(query_vector, bson_dtype)\n",
251 | "\n",
252 | " # Define the aggregation pipeline for vector search\n",
253 | " pipeline = [\n",
254 | " {\n",
255 | " \"$vectorSearch\": {\n",
256 | " \"index\": index_name, # Replace with your index name\n",
257 | " \"path\": path, # Path to the embedding field\n",
258 | " \"queryVector\": bson_query_vector, # BSON-encoded query vector\n",
259 | " \"numCandidates\": 5,\n",
260 | " \"limit\": 2\n",
261 | " }\n",
262 | " },\n",
263 | " {\n",
264 | " \"$project\": {\n",
265 | " \"_id\": 0,\n",
266 | " \"text\": 1,\n",
267 | " \"score\": { \"$meta\": \"vectorSearchScore\" } # Include the similarity score\n",
268 | " }\n",
269 | " }\n",
270 | " ]\n",
271 | "\n",
272 | " # Run the aggregation pipeline and return results\n",
273 | " return collection.aggregate(pipeline)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "from pprint import pprint\n",
283 | "\n",
284 | "# Define a list of embedding fields to query\n",
285 | "embedding_fields = [float32_field, int8_field, int1_field] \n",
286 | "results = {}\n",
287 | "\n",
288 | "# Run vector search queries for each embedding type\n",
289 | "query_text = \"science fact\"\n",
290 | "for field in embedding_fields:\n",
291 | " results[field] = list(run_vector_search(query_text, collection, field)) \n",
292 | "\n",
293 | "# Print the results\n",
294 | "for field, field_results in results.items():\n",
295 | " print(f\"Results from {field}\")\n",
296 | " pprint(field_results)"
297 | ]
298 | }
299 | ],
300 | "metadata": {
301 | "kernelspec": {
302 | "display_name": "Python 3",
303 | "language": "python",
304 | "name": "python3"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 3
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython3",
316 | "version": "3.9.12"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 2
321 | }
322 |
--------------------------------------------------------------------------------