├── Quickstart-Agentic-Retrieval ├── .gitignore ├── requirements.txt ├── sample.env └── quickstart-agentic-retrieval.ipynb ├── azure-function-search ├── lookup.sample.dat ├── suggest.sample.dat ├── .funcignore ├── search.sample.dat ├── host.json ├── function_app.py ├── shared_code │ └── __init__.py ├── readme.md ├── local.settings.json.rename ├── requirements.txt ├── lookup.py ├── suggest.py ├── .gitignore └── search.py ├── Quickstart-Semantic-Search ├── requirements.txt └── sample.env ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── Quickstart-Vector-Search ├── sample.env └── requirements.txt ├── Quickstart-Document-Permissions-Push-API ├── sample.env ├── requirements.txt └── document-permissions-push-api.ipynb ├── Quickstart-Document-Permissions-Pull-API ├── requirements.txt ├── sample.env └── document-permissions-pull-api.ipynb ├── agentic-retrieval-pipeline-example ├── requirements.txt ├── sample.env └── agent-example.ipynb ├── bulk-insert ├── requirements.txt ├── readme.md ├── bulk-insert.py └── good-books-index.json ├── LICENSE.md ├── .gitignore ├── Quickstart ├── README.md └── azure-search-quickstart.ipynb ├── README.md ├── CONTRIBUTING.md └── cmk-example └── cmk-example.ipynb /Quickstart-Agentic-Retrieval/.gitignore: -------------------------------------------------------------------------------- 1 | *.jsonl -------------------------------------------------------------------------------- /azure-function-search/lookup.sample.dat: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Hello" 3 | } -------------------------------------------------------------------------------- /azure-function-search/suggest.sample.dat: -------------------------------------------------------------------------------- 1 | {"q": "w", "top": 5, suggester: "sg"} -------------------------------------------------------------------------------- /azure-function-search/.funcignore: 
-------------------------------------------------------------------------------- 1 | .git* 2 | .vscode 3 | local.settings.json 4 | test 5 | .venv -------------------------------------------------------------------------------- /Quickstart-Semantic-Search/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-identity 2 | dotenv 3 | azure-search-documents==11.7.0b2 4 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } -------------------------------------------------------------------------------- /Quickstart-Semantic-Search/sample.env: -------------------------------------------------------------------------------- 1 | AZURE_SEARCH_ENDPOINT=https://your-search-service.search.windows.net 2 | AZURE_SEARCH_INDEX_NAME=hotels-sample-index -------------------------------------------------------------------------------- /Quickstart-Vector-Search/sample.env: -------------------------------------------------------------------------------- 1 | AZURE_SEARCH_ENDPOINT=https://your-search-service.search.windows.net 2 | AZURE_SEARCH_INDEX_NAME=vector-search-quickstart -------------------------------------------------------------------------------- /azure-function-search/search.sample.dat: -------------------------------------------------------------------------------- 1 | {"q":"","top":8,"skip":0,"filters":[{"field":"authors","value":"James Patterson"},{"field":"language_code","value":"en-US"}]} -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Push-API/sample.env: -------------------------------------------------------------------------------- 1 | AZURE_SEARCH_ENDPOINT=https://your-search-service.search.windows.net 2 | 
AZURE_SEARCH_INDEX=document-permissions-push-idx -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Push-API/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-identity 2 | aiohttp 3 | ipykernel 4 | dotenv 5 | requests 6 | msgraph-sdk 7 | azure-search-documents==11.7.0b2 -------------------------------------------------------------------------------- /Quickstart-Agentic-Retrieval/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-identity 2 | azure-ai-evaluation 3 | openai 4 | aiohttp 5 | ipykernel 6 | dotenv 7 | requests 8 | azure-search-documents==11.7.0b2 -------------------------------------------------------------------------------- /Quickstart-Vector-Search/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-ai-projects==1.0.0b11 2 | azure-ai-agents==1.0.0 3 | azure-identity 4 | aiohttp 5 | ipykernel 6 | dotenv 7 | azure-search-documents==11.7.0b2 8 | -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Pull-API/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-identity 2 | aiohttp 3 | ipykernel 4 | dotenv 5 | requests 6 | msgraph-sdk 7 | azure-storage-file-datalake 8 | azure-search-documents==11.7.0b2 -------------------------------------------------------------------------------- /agentic-retrieval-pipeline-example/requirements.txt: -------------------------------------------------------------------------------- 1 | azure-ai-projects==2.0.0b1 2 | azure-mgmt-cognitiveservices 3 | azure-identity 4 | ipykernel 5 | dotenv 6 | azure-search-documents==11.7.0b2 7 | requests 8 | openai -------------------------------------------------------------------------------- /bulk-insert/requirements.txt: 
-------------------------------------------------------------------------------- 1 | azure-core 2 | azure-search-documents 3 | requests 4 | certifi 5 | chardet 6 | idna 7 | isodate 8 | msrest 9 | numpy 10 | oauthlib 11 | pandas 12 | python-dateutil 13 | pytz 14 | requests-oauthlib 15 | six 16 | typing-extensions 17 | urllib3 -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Python Functions", 6 | "type": "python", 7 | "request": "attach", 8 | "port": 9091, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /azure-function-search/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[4.*, 5.0.0)" 14 | } 15 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.projectSubpath": "search-website-functions-v4/api", 3 | "azureFunctions.deploySubpath": "search-website-functions-v4/api", 4 | "azureFunctions.scmDoBuildDuringDeployment": true, 5 | "azureFunctions.pythonVenv": ".venv", 6 | "azureFunctions.projectLanguage": "Python", 7 | "azureFunctions.projectRuntime": "~4", 8 | "debug.internalConsoleOptions": "neverOpen" 9 | } -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Pull-API/sample.env: 
-------------------------------------------------------------------------------- 1 | AZURE_SEARCH_ENDPOINT=https://your-search-service.search.windows.net 2 | AZURE_SEARCH_INDEX=document-permissions-indexer-idx 3 | AZURE_SEARCH_INDEXER=document-permissions-indexer-idxr 4 | AZURE_SEARCH_DATASOURCE=document-permissions-indexer-ds 5 | AZURE_STORAGE_ACCOUNT_NAME= 6 | AZURE_STORAGE_CONTAINER_NAME=state-parks 7 | AZURE_STORAGE_CONNECTION_STRING= 8 | AZURE_STORAGE_RESOURCE_ID= -------------------------------------------------------------------------------- /azure-function-search/function_app.py: -------------------------------------------------------------------------------- 1 | import azure.functions as func 2 | import logging 3 | import json 4 | from search import bp as search_bp 5 | from lookup import bp as lookup_bp 6 | from suggest import bp as suggest_bp 7 | 8 | app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) 9 | 10 | app.register_functions(lookup_bp) 11 | app.register_functions(search_bp) 12 | app.register_functions(suggest_bp) 13 | 14 | 15 | -------------------------------------------------------------------------------- /azure-function-search/shared_code/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def azure_config(): 5 | 6 | configs = {} 7 | configs["search_facets"] = os.environ.get("SearchFacets", "") 8 | configs["search_index_name"] = os.environ.get("SearchIndexName", "") 9 | configs["search_service_name"] = os.environ.get("SearchServiceName", "") 10 | configs["search_api_key"] = os.environ.get("SearchApiKey", "") 11 | 12 | return configs 13 | -------------------------------------------------------------------------------- /azure-function-search/readme.md: -------------------------------------------------------------------------------- 1 | # Create an Azure function that specifies queries 2 | 3 | This folder contains source code for an Azure function that formulates 
queries, performs document lookup, and suggests typeahead queries. It's the Python version of the `api` content used in the [C# sample Add search to websites](https://learn.microsoft.com/azure/search/tutorial-csharp-overview). If you're a Python developer, you can substitute this code to create a Python version of the sample app. 4 | -------------------------------------------------------------------------------- /azure-function-search/local.settings.json.rename: -------------------------------------------------------------------------------- 1 | { 2 | "IsEncrypted": false, 3 | "Values": { 4 | "AzureWebJobsStorage": "", 5 | "AzureWebJobsFeatureFlags": "EnableWorkerIndexing", 6 | "FUNCTIONS_WORKER_RUNTIME": "python", 7 | "SearchApiKey": "YOUR-SEARCH-QUERY-KEY", 8 | "SearchServiceName": "YOUR-SEARCH-RESOURCE-NAME", 9 | "SearchIndexName": "good-books", 10 | "SearchFacets": "authors*,language_code" 11 | }, 12 | "Host": { 13 | "CORS": "*" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /agentic-retrieval-pipeline-example/sample.env: -------------------------------------------------------------------------------- 1 | PROJECT_ENDPOINT=https://your-foundry-resource.services.ai.azure.com/api/projects/your-foundry-project 2 | AGENT_MODEL=gpt-4.1-mini 3 | PROJECT_RESOURCE_ID=/subscriptions/your-subscription-id/resourceGroups/your-resource-group/providers/Microsoft.CognitiveServices/accounts/your-account/projects/your-project 4 | AZURE_OPENAI_ENDPOINT=https://your-openai-service.openai.azure.com 5 | AZURE_OPENAI_GPT_DEPLOYMENT=gpt-5-mini 6 | AZURE_SEARCH_ENDPOINT=https://your-search-service.search.windows.net 7 | AZURE_SEARCH_INDEX_NAME=earth-at-night -------------------------------------------------------------------------------- /azure-function-search/requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python 
Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-common>=1.1.28,<2.0.0 6 | azure-core>=1.29.4,<2.0.0 7 | azure-functions>=1.17.0,<2.0.0 8 | azure-search-documents>=11.3.0,<12.0.0 9 | certifi>=2023.7.22,<2024.0.0 10 | chardet>=5.2.0,<6.0.0 11 | charset-normalizer>=3.2.0,<4.0.0 12 | idna>=3.4,<4.0.0 13 | isodate>=0.6.1,<1.0.0 14 | msrest>=0.7.1,<1.0.0 15 | oauthlib>=3.2.2,<4.0.0 16 | requests>=2.31.0,<3.0.0 17 | requests-oauthlib>=1.3.1,<2.0.0 18 | six>=1.16.0,<2.0.0 19 | typing_extensions>=4.8.0,<5.0.0 20 | urllib3>=2.0.4,<3.0.0 -------------------------------------------------------------------------------- /Quickstart-Agentic-Retrieval/sample.env: -------------------------------------------------------------------------------- 1 | # Variables for agentic retrieval in Azure AI Search 2 | SEARCH_ENDPOINT = https://your-search-service.search.windows.net 3 | AOAI_ENDPOINT = https://your-foundry-resource.openai.azure.com 4 | AOAI_EMBEDDING_MODEL = text-embedding-3-large 5 | AOAI_EMBEDDING_DEPLOYMENT = text-embedding-3-large 6 | AOAI_GPT_MODEL = gpt-5-mini 7 | AOAI_GPT_DEPLOYMENT = gpt-5-mini 8 | INDEX_NAME = earth-at-night 9 | KNOWLEDGE_SOURCE_NAME = earth-knowledge-source 10 | KNOWLEDGE_BASE_NAME = earth-knowledge-base 11 | 12 | # Variables for evaluation in Microsoft Foundry 13 | FOUNDRY_ENDPOINT = https://your-foundry-resource.services.ai.azure.com/api/projects/your-project-id 14 | AOAI_API_VERSION = 2025-04-01-preview -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "func", 6 | "command": "host start", 7 | "problemMatcher": "$func-python-watch", 8 | "isBackground": true, 9 | "dependsOn": "pip install (functions)", 10 | "options": { 11 | "cwd": 
"${workspaceFolder}/search-website-functions-v4/api" 12 | } 13 | }, 14 | { 15 | "label": "pip install (functions)", 16 | "type": "shell", 17 | "osx": { 18 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 19 | }, 20 | "windows": { 21 | "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" 22 | }, 23 | "linux": { 24 | "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" 25 | }, 26 | "problemMatcher": [], 27 | "options": { 28 | "cwd": "${workspaceFolder}/search-website-functions-v4/api" 29 | } 30 | } 31 | ] 32 | } -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE -------------------------------------------------------------------------------- /azure-function-search/lookup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import azure.functions as func 3 | from azure.core.credentials import AzureKeyCredential 4 | from azure.search.documents import SearchClient 5 | from shared_code import azure_config 6 | import json 7 | 8 | environment_vars = azure_config() 9 | 10 | # Set Azure Search endpoint and key 11 | endpoint = f'https://{environment_vars["search_service_name"]}.search.windows.net' 12 | key = environment_vars["search_api_key"] 13 | 14 | # Your index name 15 | index_name = "good-books" 16 | 17 | # Create Azure SDK client 18 | search_client = SearchClient(endpoint, index_name, AzureKeyCredential(key)) 19 | 20 | bp = func.Blueprint() 21 | @bp.function_name("lookup") 22 | @bp.route(route="lookup", methods=[func.HttpMethod.GET, func.HttpMethod.POST]) 23 | def main(req: func.HttpRequest) -> func.HttpResponse: 24 | 25 | # http://localhost:7071/api/Lookup?id=100 26 | docid = req.params.get("id") 27 | 28 | if docid: 29 | logging.info(f"/Lookup id = {docid}") 30 | returnedDocument = search_client.get_document(key=docid) 31 | 32 | full_response = {} 33 | full_response["document"] = returnedDocument 34 | 35 | return func.HttpResponse( 36 | body=json.dumps(full_response), mimetype="application/json", status_code=200 37 | ) 38 | else: 39 | return func.HttpResponse("No doc id param found.", status_code=200) 40 | -------------------------------------------------------------------------------- /azure-function-search/suggest.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import azure.functions as func 3 | from azure.core.credentials import AzureKeyCredential 4 | from azure.search.documents import SearchClient 5 | from shared_code import azure_config 6 | import json 7 | 8 | environment_vars = azure_config() 9 | 10 | # curl --header "Content-Type: application/json" \ 11 | # --request POST \ 12 | # --data '{"q":"code","top":"5", "suggester":"sg"}' \ 13 | # http://localhost:7071/api/Suggest 14 | 15 | # Set Azure Search endpoint and key 16 | service_name = environment_vars["search_service_name"] 17 | endpoint = f"https://{service_name}.search.windows.net" 18 | key = environment_vars["search_api_key"] 19 | 20 | # Your index name 21 | index_name = "good-books" 22 | 23 | # Create Azure SDK client 24 | search_client = SearchClient(endpoint, index_name, AzureKeyCredential(key)) 25 | 26 | 27 | bp=func.Blueprint() 28 | @bp.function_name("suggest") 29 | @bp.route(route="suggest", methods=[func.HttpMethod.GET, func.HttpMethod.POST] ) 30 | def main(req: func.HttpRequest) -> func.HttpResponse: 31 | 32 | # variables sent in body 33 | req_body = req.get_json() 34 | q = req_body.get("q") 35 | top = req_body.get("top") or 5 36 | suggester = req_body.get("suggester") or "sg" 37 | 38 | if q: 39 | logging.info("/Suggest q = %s", q) 40 | suggestions = search_client.suggest(search_text=q, suggester_name=suggester, top=top) 41 | 42 | # format the React app expects 43 | full_response = {} 44 | full_response["suggestions"] = suggestions 45 | logging.debug(suggestions) 46 | 47 | return func.HttpResponse( 48 | body=json.dumps(full_response), mimetype="application/json", status_code=200 49 | ) 50 | else: 51 | return func.HttpResponse("No query param found.", status_code=200) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Azurite files for local environment 107 | __azurite_* 108 | -------------------------------------------------------------------------------- /Quickstart/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | page_type: sample 3 | languages: 4 | 
- python 5 | name: Python quickstart for Azure AI Search 6 | products: 7 | - azure 8 | - azure-cognitive-search 9 | description: | 10 | Learn how to create, load, and query an Azure AI Search index using Python. 11 | urlFragment: python-quickstart 12 | --- 13 | 14 | # Quickstart: Python for Azure AI Search 15 | 16 | ![Flask sample MIT license badge](https://img.shields.io/badge/license-MIT-green.svg) 17 | 18 | This sample demonstrates how to use the Azure SDK for Python to create an Azure AI Search index, load it with documents, and execute queries. The index is modeled on a subset of the hotels dataset, which is reduced in this sample for readability and comprehension. The code includes the index definition and documents. 19 | 20 | This sample uses a Jupyter notebook (.ipynb) file to perform the actions against the Azure AI Search service. 21 | 22 | ## Prerequisites 23 | 24 | * [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/). 25 | 26 | * The [azure-search-documents package](https://pypi.org/project/azure-search-documents/) from the Azure SDK for Python. 27 | 28 | ## Set up the sample 29 | 30 | 1. Clone or download this sample repository. 31 | 32 | 1. Extract contents if the download is a zip file. Make sure the files are read-write. 33 | 34 | ## Run the sample 35 | 36 | 1. Open the azure-search-quickstart.ipynb file in Visual Studio Code. 37 | 38 | 1. Set the service endpoint and API key for your search service: 39 | 40 | * service_name = "YOUR-SEARCH-SERVICE-NAME" 41 | * admin_key = "YOUR-SEARCH-SERVICE-ADMIN-API-KEY" 42 | 43 | 1. Run each step in sequence. 44 | 45 | ## Next step 46 | 47 | You can learn more about Azure AI Search on the [official documentation site](https://learn.microsoft.com/azure/search). 
48 | -------------------------------------------------------------------------------- /bulk-insert/readme.md: -------------------------------------------------------------------------------- 1 | # Create an Azure AI Search index from a CSV file 2 | 3 | This folder contains source code for a bulk-insert program that creates and loads an index using the good-books sample data in a CSV folder. It's the Python version of the `bulk-insert` content used in the [C# sample Add search to websites](https://learn.microsoft.com/azure/search/tutorial-csharp-overview). If you're a Python developer, you can substitute this code to create a Python version of the sample app. 4 | 5 | You can also run this code standalone to create a good-books index on your search service. 6 | 7 | 1. Check your search service to make sure you have room for an extra index. The **Usage** tab on the Azure portal's search service page provides this information. The maximum limit on the free tier is 3 indexes. The maximum limit on the Basic tier is 15 indexes. 8 | 9 | 1. Change the following values in the `bulk-insert.py` file: 10 | 11 | * YOUR-SEARCH-RESOURCE-NAME (not the full URL) 12 | * YOUR-SEARCH-ADMIN-KEY 13 | 14 | 1. Create a virtual environment. Press Ctrl-Shift-P to open the command palette and search for `Python: Create Environment`. 15 | 16 | 1. Open an integrated terminal in Visual Studio Code. 17 | 18 | 1. Make sure the path is "azure-search-static-web-app/python/bulk-insert". 19 | 20 | 1. Install the dependencies: 21 | 22 | ```bash 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | 1. Run the program: 27 | 28 | ```bash 29 | py bulk-insert.py 30 | ``` 31 | 32 | 1. You should see the following output: 33 | 34 | ```bash 35 | Schema uploaded; Index created for good-books. 36 | Batch sent! - #1 37 | Batch sent! - #2 38 | Batch sent! - #3 39 | Batch sent! - #4 40 | Batch sent! - #5 41 | Batch sent! - #6 42 | Batch sent! - #7 43 | Batch sent! - #8 44 | Batch sent! - #9 45 | Batch sent! 
- #10 46 | Done! 47 | Upload complete 48 | ``` 49 | 50 | If you get a "file not found error" on good-books-index.json, try adding the "Terminal: Execute in File Directory" in **Settings** > **Extensions** > **Python**. 51 | -------------------------------------------------------------------------------- /azure-function-search/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Azure Functions artifacts 126 | bin 127 | obj 128 | appsettings.json 129 | local.settings.json 130 | 131 | # Azurite artifacts 132 | __blobstorage__ 133 | __queuestorage__ 134 | __azurite_db*__.json 135 | .python_packages -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python samples for Azure AI Search 2 | 3 | This repository contains Python code samples used in Azure AI Search documentation. Unless noted otherwise, all samples run on the shared (free) pricing tier of a [search service](https://learn.microsoft.com/azure/search/search-create-service-portal). 4 | 5 | If your configuration uses a search service managed identity for indexer connections, your search service must be on the Basic tier or higher. 6 | 7 | ## Day-one quickstarts and tutorials 8 | 9 | | Sample | Description | 10 | |--------|-------------| 11 | | [Quickstart](Quickstart/README.md) | Introduces the fundamental tasks of working with a classic search index: create, load, and query. 
The index is modeled on a subset of the hotels dataset, which is widely used in Azure AI Search samples but reduced in this sample for readability and comprehension. | 12 | | [Quickstart-Agentic-Retrieval](Quickstart-Agentic-Retrieval/quickstart-agentic-retrieval.ipynb) | Create a knowledge base in Azure AI Search to integrate LLM reasoning into query planning. | 13 | | [Quickstart-Document-Permissions-Pull-API](Quickstart-Document-Permissions-Pull-API/document-permissions-pull-api.ipynb) | Using an indexer "pull API" approach, flow access control lists from a data source to search results and apply permission filters that restrict access to authorized content. Indexer support is limited to Azure Data Lake Storage (ADLS) Gen2 permission metadata. | 14 | | [Quickstart-Document-Permissions-Push-API](Quickstart-Document-Permissions-Push-API/document-permissions-push-api.ipynb) | Using the push APIs for indexing a JSON payload, flow embedded permission metadata to indexed documents and search results that are filtered based on user access to authorized content. | 15 | | [Quickstart-Semantic-Search](Quickstart-Semantic-Search/semantic-search-quickstart.ipynb) | Extends the quickstart through modifications that invoke semantic ranking. This notebook adds a semantic configuration to the index and semantic query options that formulate the query and response. | 16 | | [Quickstart-Vector-Search](Quickstart-Vector-Search/quickstart-vector-search.ipynb) | Introduces vector search in Azure AI Search. This notebook demonstrates how to create, load, and query a vector index. | 17 | 18 | ## Deeper dive tutorials 19 | 20 | | Sample | Description | 21 | |--------|-------------| 22 | | [agentic-retrieval-pipeline-example](agentic-retrieval-pipeline-example/agent-example.ipynb) | Extends the quickstart by integrating Foundry Agent Service. Add an AI agent and MCP tool to your Azure AI Search agentic retrieval pipeline for an end-to-end conversational search experience. 
| 23 | | [azure-function-search](azure-function-search/readme.md) | An Azure Function that sends query requests to an Azure AI Search service. You can substitute this code to replace the contents of the `api` folder in the C# sample [azure-search-static-web-app](https://github.com/Azure-Samples/azure-search-static-web-app). | 24 | | [bulk-insert](bulk-insert/readme.md) | Create and load an index using the push APIs and sample data. You can substitute this code to replace the contents of the `bulk-insert` folder in the C# sample [azure-search-static-web-app](https://github.com/Azure-Samples/azure-search-static-web-app) | 25 | | [cmk-encryption](cmk-example/cmk-example.ipynb) | Encrypt content using customer-managed keys. | 26 | 27 | ## Archived samples 28 | 29 | + **azureml-custom-skill**: See the **Archive** branch of this repository. 30 | + **image-processing**: See [azure-search-sample-archive/tree/main/image-processing](https://github.com/Azure-Samples/azure-search-sample-archive/tree/main/image-processing). 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Azure AI Search samples 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 6 | 7 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | 15 | - [Code of Conduct](#coc) 16 | - [Issues and Bugs](#issue) 17 | - [Feature Requests](#feature) 18 | - [Submission Guidelines](#submit) 19 | 20 | ## Code of Conduct 21 | Help us keep this project open and inclusive. Please read and follow our [Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 22 | 23 | ## Found an Issue? 24 | If you find a bug in the source code or a mistake in the documentation, you can help us by 25 | [submitting an issue](#submit-issue) to the GitHub Repository. Even better, you can 26 | [submit a Pull Request](#submit-pr) with a fix. 27 | 28 | ## Want a Feature? 29 | You can *request* a new feature by [submitting an issue](#submit-issue) to the GitHub 30 | Repository. If you would like to *implement* a new feature, please submit an issue with 31 | a proposal for your work first, to be sure that we can use it. 32 | 33 | * **Small Features** can be crafted and directly [submitted as a Pull Request](#submit-pr). 34 | 35 | ## Submission Guidelines 36 | 37 | ### Submitting an Issue 38 | Before you submit an issue, search the archive, maybe your question was already answered. 39 | 40 | If your issue appears to be a bug, and hasn't been reported, open a new issue. 41 | Help us to maximize the effort we can spend fixing issues and adding new 42 | features, by not reporting duplicate issues. Providing the following information will increase the 43 | chances of your issue being dealt with quickly: 44 | 45 | * **Overview of the Issue** - if an error is being thrown a non-minified stack trace helps 46 | * **Version** - what version is affected (e.g. 
0.1.2) 47 | * **Motivation for or Use Case** - explain what you are trying to do and why the current behavior is a bug for you 48 | * **Browsers and Operating System** - is this a problem with all browsers? 49 | * **Reproduce the Error** - provide a live example or an unambiguous set of steps 50 | * **Related Issues** - has a similar issue been reported before? 51 | * **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be 52 | causing the problem (line of code or commit) 53 | 54 | You can file new issues by providing the above information at the corresponding repository's issues link: https://github.com/[organization-name]/[repository-name]/issues/new. 55 | 56 | ### Submitting a Pull Request (PR) 57 | Before you submit your Pull Request (PR) consider the following guidelines: 58 | 59 | * Search the repository (https://github.com/[organization-name]/[repository-name]/pulls) for an open or closed PR 60 | that relates to your submission. You don't want to duplicate effort. 61 | 62 | * Make your changes in a new git fork: 63 | 64 | * Commit your changes using a descriptive commit message 65 | * Push your fork to GitHub: 66 | * In GitHub, create a pull request 67 | * If we suggest changes then: 68 | * Make the required updates. 69 | * Rebase your fork and force push to your GitHub repository (this will update your Pull Request): 70 | 71 | ```shell 72 | git rebase master -i 73 | git push -f 74 | ``` 75 | 76 | That's it! Thank you for your contribution! 
77 | -------------------------------------------------------------------------------- /azure-function-search/search.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import azure.functions as func 3 | from azure.core.credentials import AzureKeyCredential 4 | from azure.search.documents import SearchClient 5 | from shared_code import azure_config 6 | import json 7 | 8 | environment_vars = azure_config() 9 | 10 | # Set Azure Search endpoint and key 11 | endpoint = f'https://{environment_vars["search_service_name"]}.search.windows.net' 12 | key = environment_vars["search_api_key"] 13 | 14 | # Your index name 15 | index_name = "good-books" 16 | 17 | # Create Azure SDK client 18 | search_client = SearchClient(endpoint, index_name, AzureKeyCredential(key)) 19 | 20 | # returns obj like {authors: 'array', language_code:'string'} 21 | def read_facets(facetsString): 22 | facets = facetsString.split(",") 23 | output = {} 24 | for x in facets: 25 | if x.find("*") != -1: 26 | newVal = x.replace("*", "") 27 | output[newVal] = "array" 28 | else: 29 | output[x] = "string" 30 | 31 | return output 32 | 33 | 34 | # creates filters in odata syntax 35 | def create_filter_expression(filter_list, facets): 36 | i = 0 37 | filter_expressions = [] 38 | return_string = "" 39 | separator = " and " 40 | 41 | while i < len(filter_list): 42 | field = filter_list[i]["field"] 43 | value = filter_list[i]["value"] 44 | 45 | if facets[field] == "array": 46 | print("array") 47 | filter_expressions.append(f"{field}/any(t: search.in(t, '{value}', ','))") 48 | else: 49 | print("value") 50 | filter_expressions.append(f"{field} eq '{value}'") 51 | 52 | i += 1 53 | 54 | return_string = separator.join(filter_expressions) 55 | 56 | return return_string 57 | 58 | 59 | def new_shape(docs): 60 | 61 | old_api_shape = list(docs) 62 | 63 | client_side_expected_shape = [] 64 | 65 | for item in old_api_shape: 66 | 67 | new_document = {} 68 | 
new_document["score"] = item["@search.score"] 69 | new_document["highlights"] = item["@search.highlights"] 70 | 71 | new_api_shape = {} 72 | new_api_shape["id"] = item["id"] 73 | new_api_shape["goodreads_book_id"] = item["goodreads_book_id"] 74 | new_api_shape["best_book_id"] = item["best_book_id"] 75 | new_api_shape["work_id"] = item["work_id"] 76 | new_api_shape["books_count"] = item["books_count"] 77 | new_api_shape["isbn"] = item["isbn"] 78 | new_api_shape["isbn13"] = item["isbn13"] 79 | new_api_shape["authors"] = item["authors"] 80 | new_api_shape["original_publication_year"] = item["original_publication_year"] 81 | new_api_shape["original_title"] = item["original_title"] 82 | new_api_shape["title"] = item["title"] 83 | new_api_shape["language_code"] = item["language_code"] 84 | new_api_shape["average_rating"] = item["average_rating"] 85 | new_api_shape["ratings_count"] = item["ratings_count"] 86 | new_api_shape["work_ratings_count"] = item["work_ratings_count"] 87 | new_api_shape["work_text_reviews_count"] = item["work_text_reviews_count"] 88 | new_api_shape["ratings_1"] = item["ratings_1"] 89 | new_api_shape["ratings_2"] = item["ratings_2"] 90 | new_api_shape["ratings_3"] = item["ratings_3"] 91 | new_api_shape["ratings_4"] = item["ratings_4"] 92 | new_api_shape["ratings_5"] = item["ratings_5"] 93 | new_api_shape["image_url"] = item["image_url"] 94 | new_api_shape["small_image_url"] = item["small_image_url"] 95 | 96 | new_document["document"] = new_api_shape 97 | 98 | client_side_expected_shape.append(new_document) 99 | 100 | return list(client_side_expected_shape) 101 | 102 | bp=func.Blueprint() 103 | @bp.function_name("search") 104 | @bp.route(route="search", methods=[func.HttpMethod.GET, func.HttpMethod.POST] ) 105 | def main(req: func.HttpRequest) -> func.HttpResponse: 106 | 107 | # variables sent in body 108 | req_body = req.get_json() 109 | q = req_body.get("q") 110 | top = req_body.get("top") or 8 111 | skip = req_body.get("skip") or 0 112 | filters = 
req_body.get("filters") or [] 113 | 114 | facets = environment_vars["search_facets"] 115 | facetKeys = read_facets(facets) 116 | 117 | search_filter = "" 118 | if len(filters): 119 | search_filter = create_filter_expression(filters, facetKeys) 120 | 121 | if q: 122 | logging.info(f"/Search q = {q}") 123 | 124 | search_results = search_client.search( 125 | search_text=q, 126 | top=top, 127 | skip=skip, 128 | facets=facetKeys, 129 | filter=search_filter, 130 | include_total_count=True, 131 | ) 132 | 133 | returned_docs = new_shape(search_results) 134 | 135 | # format the React app expects 136 | full_response = {} 137 | 138 | full_response["count"] = search_results.get_count() 139 | full_response["facets"] = search_results.get_facets() 140 | full_response["results"] = returned_docs 141 | 142 | return func.HttpResponse( 143 | body=json.dumps(full_response), mimetype="application/json", status_code=200 144 | ) 145 | else: 146 | return func.HttpResponse("No query param found.", status_code=200) 147 | -------------------------------------------------------------------------------- /bulk-insert/bulk-insert.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import requests 4 | import pandas as pd 5 | from azure.core.credentials import AzureKeyCredential 6 | from azure.search.documents import SearchClient 7 | from azure.search.documents.indexes import SearchIndexClient 8 | from azure.search.documents.indexes.models import SearchIndex 9 | from azure.search.documents.indexes.models import ( 10 | ComplexField, 11 | CorsOptions, 12 | SearchIndex, 13 | ScoringProfile, 14 | SearchFieldDataType, 15 | SimpleField, 16 | SearchableField, 17 | ) 18 | 19 | # Get the service name (short name) and admin API key from the environment 20 | service_name = "YOUR-SEARCH-SERVICE-NAME" 21 | key = "YOUR-SEARCH-SERVICE-ADMIN-API-KEY" 22 | endpoint = "https://{}.search.windows.net/".format(service_name) 23 | 24 | # Give your index 
a name 25 | # You can also supply this at runtime in __main__ 26 | index_name = "good-books" 27 | 28 | # Search Index Schema definition 29 | index_schema = "./good-books-index.json" 30 | 31 | # Books catalog 32 | books_url = "https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/main/good-books/books.csv" 33 | batch_size = 1000 34 | 35 | # Instantiate a client 36 | class CreateClient(object): 37 | def __init__(self, endpoint, key, index_name): 38 | self.endpoint = endpoint 39 | self.index_name = index_name 40 | self.key = key 41 | self.credentials = AzureKeyCredential(key) 42 | 43 | # Create a SearchClient 44 | # Use this to upload docs to the Index 45 | def create_search_client(self): 46 | return SearchClient( 47 | endpoint=self.endpoint, 48 | index_name=self.index_name, 49 | credential=self.credentials, 50 | ) 51 | 52 | # Create a SearchIndexClient 53 | # This is used to create, manage, and delete an index 54 | def create_admin_client(self): 55 | return SearchIndexClient(endpoint=endpoint, credential=self.credentials) 56 | 57 | 58 | # Get Schema from File or URL 59 | def get_schema_data(schema, url=False): 60 | if not url: 61 | with open(schema) as json_file: 62 | schema_data = json.load(json_file) 63 | return schema_data 64 | else: 65 | data_from_url = requests.get(schema) 66 | schema_data = json.loads(data_from_url.content) 67 | return schema_data 68 | 69 | 70 | # Create Search Index from the schema 71 | # If reading the schema from a URL, set url=True 72 | def create_schema_from_json_and_upload(schema, index_name, admin_client, url=False): 73 | 74 | cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60) 75 | scoring_profiles = [] 76 | schema_data = get_schema_data(schema, url) 77 | 78 | index = SearchIndex( 79 | name=index_name, 80 | fields=schema_data["fields"], 81 | scoring_profiles=scoring_profiles, 82 | suggesters=schema_data["suggesters"], 83 | cors_options=cors_options, 84 | ) 85 | 86 | try: 87 | upload_schema = 
admin_client.create_index(index) 88 | if upload_schema: 89 | print(f"Schema uploaded; Index created for {index_name}.") 90 | else: 91 | exit(0) 92 | except: 93 | print("Unexpected error:", sys.exc_info()[0]) 94 | 95 | 96 | # Convert CSV data to JSON 97 | def convert_csv_to_json(url): 98 | df = pd.read_csv(url) 99 | convert = df.to_json(orient="records") 100 | return json.loads(convert) 101 | 102 | 103 | # Batch your uploads to Azure Search 104 | def batch_upload_json_data_to_index(json_file, client): 105 | batch_array = [] 106 | count = 0 107 | batch_counter = 0 108 | for i in json_file: 109 | count += 1 110 | batch_array.append( 111 | { 112 | "id": str(i["book_id"]), 113 | "goodreads_book_id": int(i["goodreads_book_id"]), 114 | "best_book_id": int(i["best_book_id"]), 115 | "work_id": int(i["work_id"]), 116 | "books_count": i["books_count"] if i["books_count"] else 0, 117 | "isbn": str(i["isbn"]), 118 | "isbn13": str(i["isbn13"]), 119 | "authors": i["authors"].split(",") if i["authors"] else None, 120 | "original_publication_year": int(i["original_publication_year"]) 121 | if i["original_publication_year"] 122 | else 0, 123 | "original_title": i["original_title"], 124 | "title": i["title"], 125 | "language_code": i["language_code"], 126 | "average_rating": int(i["average_rating"]) 127 | if i["average_rating"] 128 | else 0, 129 | "ratings_count": int(i["ratings_count"]) if i["ratings_count"] else 0, 130 | "work_ratings_count": int(i["work_ratings_count"]) 131 | if i["work_ratings_count"] 132 | else 0, 133 | "work_text_reviews_count": i["work_text_reviews_count"] 134 | if i["work_text_reviews_count"] 135 | else 0, 136 | "ratings_1": int(i["ratings_1"]) if i["ratings_1"] else 0, 137 | "ratings_2": int(i["ratings_2"]) if i["ratings_2"] else 0, 138 | "ratings_3": int(i["ratings_3"]) if i["ratings_3"] else 0, 139 | "ratings_4": int(i["ratings_4"]) if i["ratings_4"] else 0, 140 | "ratings_5": int(i["ratings_5"]) if i["ratings_5"] else 0, 141 | "image_url": i["image_url"], 
142 | "small_image_url": i["small_image_url"], 143 | } 144 | ) 145 | 146 | # In this sample, we limit batches to 1000 records. 147 | # When the counter hits a number divisible by 1000, the batch is sent. 148 | if count % batch_size == 0: 149 | client.upload_documents(documents=batch_array) 150 | batch_counter += 1 151 | print(f"Batch sent! - #{batch_counter}") 152 | batch_array = [] 153 | 154 | # This will catch any records left over, when not divisible by 1000 155 | if len(batch_array) > 0: 156 | client.upload_documents(documents=batch_array) 157 | batch_counter += 1 158 | print(f"Final batch sent! - #{batch_counter}") 159 | 160 | print("Done!") 161 | 162 | 163 | if __name__ == "__main__": 164 | start_client = CreateClient(endpoint, key, index_name) 165 | admin_client = start_client.create_admin_client() 166 | search_client = start_client.create_search_client() 167 | schema = create_schema_from_json_and_upload( 168 | index_schema, index_name, admin_client, url=False 169 | ) 170 | books_data = convert_csv_to_json(books_url) 171 | batch_upload = batch_upload_json_data_to_index(books_data, search_client) 172 | print("Upload complete") 173 | -------------------------------------------------------------------------------- /bulk-insert/good-books-index.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "good-books", 3 | "fields": [ 4 | { 5 | "name": "id", 6 | "type": "Edm.String", 7 | "facetable": false, 8 | "filterable": false, 9 | "key": true, 10 | "retrievable": true, 11 | "searchable": true, 12 | "sortable": false, 13 | "analyzer": "standard.lucene", 14 | "indexAnalyzer": null, 15 | "searchAnalyzer": null, 16 | "synonymMaps": [], 17 | "fields": [] 18 | }, 19 | { 20 | "name": "goodreads_book_id", 21 | "type": "Edm.Int64", 22 | "facetable": false, 23 | "filterable": false, 24 | "retrievable": true, 25 | "sortable": false, 26 | "analyzer": null, 27 | "indexAnalyzer": null, 28 | "searchAnalyzer": null, 29 | 
"synonymMaps": [], 30 | "fields": [] 31 | }, 32 | { 33 | "name": "best_book_id", 34 | "type": "Edm.Int64", 35 | "facetable": false, 36 | "filterable": false, 37 | "retrievable": true, 38 | "sortable": false, 39 | "analyzer": null, 40 | "indexAnalyzer": null, 41 | "searchAnalyzer": null, 42 | "synonymMaps": [], 43 | "fields": [] 44 | }, 45 | { 46 | "name": "work_id", 47 | "type": "Edm.Int64", 48 | "facetable": false, 49 | "filterable": false, 50 | "retrievable": true, 51 | "sortable": false, 52 | "analyzer": null, 53 | "indexAnalyzer": null, 54 | "searchAnalyzer": null, 55 | "synonymMaps": [], 56 | "fields": [] 57 | }, 58 | { 59 | "name": "books_count", 60 | "type": "Edm.Int64", 61 | "facetable": false, 62 | "filterable": false, 63 | "retrievable": true, 64 | "sortable": false, 65 | "analyzer": null, 66 | "indexAnalyzer": null, 67 | "searchAnalyzer": null, 68 | "synonymMaps": [], 69 | "fields": [] 70 | }, 71 | { 72 | "name": "isbn", 73 | "type": "Edm.String", 74 | "facetable": false, 75 | "filterable": false, 76 | "key": false, 77 | "retrievable": true, 78 | "searchable": true, 79 | "sortable": false, 80 | "analyzer": "standard.lucene", 81 | "indexAnalyzer": null, 82 | "searchAnalyzer": null, 83 | "synonymMaps": [], 84 | "fields": [] 85 | }, 86 | { 87 | "name": "isbn13", 88 | "type": "Edm.String", 89 | "facetable": false, 90 | "filterable": false, 91 | "retrievable": true, 92 | "sortable": false, 93 | "analyzer": null, 94 | "indexAnalyzer": null, 95 | "searchAnalyzer": null, 96 | "synonymMaps": [], 97 | "fields": [] 98 | }, 99 | { 100 | "name": "authors", 101 | "type": "Collection(Edm.String)", 102 | "facetable": true, 103 | "filterable": true, 104 | "key": false, 105 | "retrievable": true, 106 | "searchable": true, 107 | "sortable": false, 108 | "analyzer": "standard.lucene", 109 | "indexAnalyzer": null, 110 | "searchAnalyzer": null, 111 | "synonymMaps": [], 112 | "fields": [] 113 | }, 114 | { 115 | "name": "original_publication_year", 116 | "type": "Edm.Int64", 
117 | "facetable": false, 118 | "filterable": false, 119 | "retrievable": true, 120 | "sortable": false, 121 | "analyzer": null, 122 | "indexAnalyzer": null, 123 | "searchAnalyzer": null, 124 | "synonymMaps": [], 125 | "fields": [] 126 | }, 127 | { 128 | "name": "original_title", 129 | "type": "Edm.String", 130 | "facetable": false, 131 | "filterable": false, 132 | "key": false, 133 | "retrievable": true, 134 | "searchable": true, 135 | "sortable": false, 136 | "analyzer": "standard.lucene", 137 | "indexAnalyzer": null, 138 | "searchAnalyzer": null, 139 | "synonymMaps": [], 140 | "fields": [] 141 | }, 142 | { 143 | "name": "title", 144 | "type": "Edm.String", 145 | "facetable": false, 146 | "filterable": false, 147 | "key": false, 148 | "retrievable": true, 149 | "searchable": true, 150 | "sortable": true, 151 | "analyzer": "standard.lucene", 152 | "indexAnalyzer": null, 153 | "searchAnalyzer": null, 154 | "synonymMaps": [], 155 | "fields": [] 156 | }, 157 | { 158 | "name": "language_code", 159 | "type": "Edm.String", 160 | "facetable": true, 161 | "filterable": true, 162 | "key": false, 163 | "retrievable": true, 164 | "searchable": false, 165 | "sortable": false, 166 | "analyzer": null, 167 | "indexAnalyzer": null, 168 | "searchAnalyzer": null, 169 | "synonymMaps": [], 170 | "fields": [] 171 | }, 172 | { 173 | "name": "average_rating", 174 | "type": "Edm.Double", 175 | "facetable": true, 176 | "filterable": true, 177 | "retrievable": true, 178 | "sortable": true, 179 | "analyzer": null, 180 | "indexAnalyzer": null, 181 | "searchAnalyzer": null, 182 | "synonymMaps": [], 183 | "fields": [] 184 | }, 185 | { 186 | "name": "ratings_count", 187 | "type": "Edm.Int64", 188 | "facetable": true, 189 | "filterable": true, 190 | "retrievable": true, 191 | "sortable": true, 192 | "analyzer": null, 193 | "indexAnalyzer": null, 194 | "searchAnalyzer": null, 195 | "synonymMaps": [], 196 | "fields": [] 197 | }, 198 | { 199 | "name": "work_ratings_count", 200 | "type": 
"Edm.Int64", 201 | "facetable": false, 202 | "filterable": false, 203 | "retrievable": true, 204 | "sortable": false, 205 | "analyzer": null, 206 | "indexAnalyzer": null, 207 | "searchAnalyzer": null, 208 | "synonymMaps": [], 209 | "fields": [] 210 | }, 211 | { 212 | "name": "work_text_reviews_count", 213 | "type": "Edm.Int64", 214 | "facetable": false, 215 | "filterable": false, 216 | "retrievable": true, 217 | "sortable": false, 218 | "analyzer": null, 219 | "indexAnalyzer": null, 220 | "searchAnalyzer": null, 221 | "synonymMaps": [], 222 | "fields": [] 223 | }, 224 | { 225 | "name": "ratings_1", 226 | "type": "Edm.Int64", 227 | "facetable": false, 228 | "filterable": false, 229 | "retrievable": true, 230 | "sortable": false, 231 | "analyzer": null, 232 | "indexAnalyzer": null, 233 | "searchAnalyzer": null, 234 | "synonymMaps": [], 235 | "fields": [] 236 | }, 237 | { 238 | "name": "ratings_2", 239 | "type": "Edm.Int64", 240 | "facetable": false, 241 | "filterable": false, 242 | "retrievable": true, 243 | "sortable": false, 244 | "analyzer": null, 245 | "indexAnalyzer": null, 246 | "searchAnalyzer": null, 247 | "synonymMaps": [], 248 | "fields": [] 249 | }, 250 | { 251 | "name": "ratings_3", 252 | "type": "Edm.Int64", 253 | "facetable": false, 254 | "filterable": false, 255 | "retrievable": true, 256 | "sortable": false, 257 | "analyzer": null, 258 | "indexAnalyzer": null, 259 | "searchAnalyzer": null, 260 | "synonymMaps": [], 261 | "fields": [] 262 | }, 263 | { 264 | "name": "ratings_4", 265 | "type": "Edm.Int64", 266 | "facetable": false, 267 | "filterable": false, 268 | "retrievable": true, 269 | "sortable": false, 270 | "analyzer": null, 271 | "indexAnalyzer": null, 272 | "searchAnalyzer": null, 273 | "synonymMaps": [], 274 | "fields": [] 275 | }, 276 | { 277 | "name": "ratings_5", 278 | "type": "Edm.Int64", 279 | "facetable": false, 280 | "filterable": false, 281 | "retrievable": true, 282 | "sortable": false, 283 | "analyzer": null, 284 | "indexAnalyzer": 
null, 285 | "searchAnalyzer": null, 286 | "synonymMaps": [], 287 | "fields": [] 288 | }, 289 | { 290 | "name": "image_url", 291 | "type": "Edm.String", 292 | "facetable": false, 293 | "filterable": false, 294 | "key": false, 295 | "retrievable": true, 296 | "searchable": true, 297 | "sortable": false, 298 | "analyzer": "standard.lucene", 299 | "indexAnalyzer": null, 300 | "searchAnalyzer": null, 301 | "synonymMaps": [], 302 | "fields": [] 303 | }, 304 | { 305 | "name": "small_image_url", 306 | "type": "Edm.String", 307 | "facetable": false, 308 | "filterable": false, 309 | "key": false, 310 | "retrievable": true, 311 | "searchable": true, 312 | "sortable": false, 313 | "analyzer": "standard.lucene", 314 | "indexAnalyzer": null, 315 | "searchAnalyzer": null, 316 | "synonymMaps": [], 317 | "fields": [] 318 | } 319 | ], 320 | "suggesters": [ 321 | { 322 | "name": "sg", 323 | "searchMode": "analyzingInfixMatching", 324 | "sourceFields": [ 325 | "authors", 326 | "original_title" 327 | ] 328 | } 329 | ], 330 | "scoringProfiles": [], 331 | "defaultScoringProfile": "", 332 | "corsOptions": { 333 | "allowedOrigins": [ 334 | "*" 335 | ], 336 | "maxAgeInSeconds": 300 337 | }, 338 | "analyzers": [], 339 | "charFilters": [], 340 | "tokenFilters": [], 341 | "tokenizers": [], 342 | "encryptionKey": null 343 | } -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Push-API/document-permissions-push-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "810ce279", 6 | "metadata": {}, 7 | "source": [ 8 | "# Document-level access example using the push document APIs\n", 9 | "\n", 10 | "In Azure AI Search, you can upload any JSON document payload to a search index for indexing. 
This notebook shows you how index documents that contain [user access permissions at the document level](azure/search/search-document-level-access-overview), and then query the index to return only those results that the user is authorized to view.\n", 11 | "\n", 12 | "The security principal behind the query access token determines the \"user\". The permission metadata in the document determines whether the user has authorization to the content. Internally, the search engine filters out any documents that aren't associated with the security principal.\n", 13 | "\n", 14 | "This feature is currently in preview.\n", 15 | "\n", 16 | "For an alternative approaching using indexers and pull API, see [Quickstart-Document-Permissions-Pull-API](../Quickstart-Document-Permissions-Pull-API/document-permissions-pull-api.ipynb).\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "b6585426", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prerequisites\n", 25 | "\n", 26 | "+ Azure AI Search with [role-based access control](https://learn.microsoft.com/azure/search/search-security-enable-roles).\n", 27 | "\n", 28 | "+ [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/).\n", 29 | "\n", 30 | "## Permissions\n", 31 | "\n", 32 | "This walkthrough uses Microsoft Entra ID authentication and authorization.\n", 33 | "\n", 34 | "On Azure AI Search, you must have role assignments to create objects and run queries:\n", 35 | "\n", 36 | "+ **Search Service Contributor**\n", 37 | "+ **Search Index Data Contributor**\n", 38 | "+ **Search Index Data Reader**\n", 39 | "\n", 40 | "For more information, see [Connect to Azure AI Search using roles](https://learn.microsoft.com/azure/search/search-security-rbac) and [Quickstart: Connect without keys for local 
testing](https://learn.microsoft.com/azure/search/search-get-started-rbac).\n", 41 | "\n", 42 | "## Set the environment variables\n", 43 | "\n", 44 | "1. Rename `sample.env` to `.env`.\n", 45 | "1. In the `.env` file, provide a full endpoint to your search service (https://your-search-service.search.windows.net).\n", 46 | "1. Replace the default index name if you want a different name.\n", 47 | "\n", 48 | "## Load connections\n", 49 | "\n", 50 | "We recommend creating a virtual environment to run this sample code. In Visual Studio Code, open the control palette (ctrl-shift-p) to create an environment. This notebook was tested on Python 3.10.\n", 51 | "\n", 52 | "After your environment is created, load the environment variables." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "2975a7f5", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from dotenv import load_dotenv\n", 63 | "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", 64 | "import os\n", 65 | "\n", 66 | "load_dotenv(override=True) # take environment variables from .env.\n", 67 | "\n", 68 | "# The following variables from your .env file are used in this notebook\n", 69 | "endpoint = os.environ[\"AZURE_SEARCH_ENDPOINT\"]\n", 70 | "credential = DefaultAzureCredential()\n", 71 | "index_name = os.getenv(\"AZURE_SEARCH_INDEX\")\n", 72 | "token_provider = get_bearer_token_provider(credential, \"https://search.azure.com/.default\")\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "9327cf01", 78 | "metadata": {}, 79 | "source": [ 80 | "## Create a sample index\n", 81 | "\n", 82 | "The search index must includes fields for your content and for permission metadata. Assign the new permission filter option to a string field and make sure the field is filterable. The search engine builds the filter internally at query time." 
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "9863061f", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from azure.search.documents.indexes.models import SearchField, SearchIndex, PermissionFilter, SearchIndexPermissionFilterOption\n", 93 | "from azure.search.documents.indexes import SearchIndexClient\n", 94 | "\n", 95 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 96 | "index = SearchIndex(\n", 97 | " name=index_name,\n", 98 | " fields=[\n", 99 | " SearchField(name=\"id\", type=\"Edm.String\", key=True, filterable=True, sortable=True),\n", 100 | " SearchField(name=\"oid\", type=\"Collection(Edm.String)\", retrievable=True, filterable=True, permission_filter=PermissionFilter.USER_IDS),\n", 101 | " SearchField(name=\"group\", type=\"Collection(Edm.String)\", retrievable=True, filterable=True, permission_filter=PermissionFilter.GROUP_IDS),\n", 102 | " SearchField(name=\"name\", type=\"Edm.String\", searchable=True)\n", 103 | " ],\n", 104 | " permission_filter_option=SearchIndexPermissionFilterOption.ENABLED\n", 105 | ")\n", 106 | "\n", 107 | "index_client.create_index(index=index)\n", 108 | "print(f\"Index '{index_name}' created with permission filter option enabled.\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "f5cf4169", 114 | "metadata": {}, 115 | "source": [ 116 | "## Connect to Graph to find your object ID (OID) and groups\n", 117 | "\n", 118 | "This step calls the Graph APIs to get a few group IDs for your Microsoft Entra identity. Your group IDs will be added to the access control list of the objects created in the next step." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "63904f09", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "from msgraph import GraphServiceClient\n", 129 | "client = GraphServiceClient(credentials=credential, scopes=[\"https://graph.microsoft.com/.default\"])\n", 130 | "\n", 131 | "groups = await client.me.member_of.get()\n", 132 | "me = await client.me.get()\n", 133 | "oid = me.id" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "a9ce6d0f", 139 | "metadata": {}, 140 | "source": [ 141 | "## Upload sample data\n", 142 | "\n", 143 | "This step creates the container, folders, and uploads documents into Azure Storage. It assigns your group IDs to to the access control list for each file." 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "8fb830a1", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from azure.search.documents import SearchClient\n", 154 | "search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)\n", 155 | "\n", 156 | "documents = [\n", 157 | " { \"id\": \"1\", \"oid\": [oid], \"group\": [groups.value[0].id], \"name\": \"Document 1\" },\n", 158 | " { \"id\": \"2\", \"oid\": [\"all\"], \"group\": [groups.value[0].id], \"name\": \"Document 2\" },\n", 159 | " { \"id\": \"3\", \"oid\": [oid], \"group\": [\"all\"], \"name\": \"Document 3\" },\n", 160 | " { \"id\": \"4\", \"oid\": [\"none\"], \"group\": [\"none\"], \"name\": \"Document 4\" },\n", 161 | " { \"id\": \"5\", \"oid\": [\"none\"], \"group\": [groups.value[0].id], \"name\": \"Document 5\" },\n", 162 | "]\n", 163 | "search_client.upload_documents(documents=documents)\n", 164 | "print(\"Documents uploaded to the index.\")\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "e5c93f76", 170 | "metadata": {}, 171 | "source": [ 172 | "## Search sample data with x-ms-query-source-authorization\n", 173 | 
"\n", 174 | "This query uses an empty search string (`*`) to provide an unqualified search. It returns the file name and permission metadata associated with each file. Notice that each file is associated with a different group ID." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "cd872e8c", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "results = search_client.search(search_text=\"*\", x_ms_query_source_authorization=token_provider(), select=\"name,oid,group\", order_by=\"id asc\")\n", 185 | "\n", 186 | "for result in results:\n", 187 | " print(f\"Name: {result['name']}, OID: {result['oid']}, Group: {result['group']}\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "d31b67d8", 193 | "metadata": {}, 194 | "source": [ 195 | "## Search sample data without x-ms-query-source-authorization \n", 196 | "\n", 197 | "This step demonstrates the user experience when authorization fails. No results are returned in the response." 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "a1f2f2a0", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "results = search_client.search(search_text=\"*\", x_ms_query_source_authorization=None, select=\"name,oid,group\", order_by=\"id asc\")\n", 208 | "\n", 209 | "for result in results:\n", 210 | " print(f\"Name: {result['name']}, OID: {result['oid']}, Group: {result['group']}\")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "id": "5ad253ec", 216 | "metadata": {}, 217 | "source": [ 218 | "## Next step\n", 219 | "\n", 220 | "To learn more, see [Document-level access control in Azure AI Search](https://learn.microsoft.com/azure/search/search-document-level-access-overview)." 
221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": ".venv", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.12.10" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 5 245 | } 246 | -------------------------------------------------------------------------------- /cmk-example/cmk-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python example for CMK-encryption in Azure AI Search\n", 8 | "\n", 9 | "This notebook provides sample script for [adding customer-managed key (CMK) encryption](https://learn.microsoft.com/azure/search/search-security-manage-encryption-keys) to objects on Azure AI Search." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Prerequisites\n", 17 | "\n", 18 | "\n", 19 | "- [Azure AI Search](https://learn.microsoft.com/azure/search/search-create-service-portal)\n", 20 | "- [Azure Key Vault]()\n", 21 | "- [Azure Storage](https://learn.microsoft.com/azure/storage/common/storage-account-create) or [Azure Log Analytics](https://learn.microsoft.com/azure/azure-monitor/logs/quick-create-workspace?tabs=azure-portal) for data retention of audit logs.\n", 22 | "- [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/).\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Sign in to Azure\n", 30 | "\n", 31 | "You might not need this step, but if downstream connections fail with a 401 during indexer pipeline execution, it could be because you're using the wrong tenant or subscription. You can avoid this issue by signing in from the command line, explicitly setting the tenant ID and choosing the right subscription.\n", 32 | "\n", 33 | "This section assumes you have the [Azure CLI](https://learn.microsoft.com/cli/azure/authenticate-azure-cli-interactively).\n", 34 | "\n", 35 | "1. Open a command line prompt.\n", 36 | "\n", 37 | "1. Run this command to get a list of Azure tenants: `az account tenant list`\n", 38 | "\n", 39 | "1. If you have multiple tenants, set the tenant: `az login --tenant `\n", 40 | "\n", 41 | "If you have multiple subscriptions, a list is provided so that you can select one." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Create a virtual environment in Visual Studio Code\n", 49 | "\n", 50 | "Create a virtual environment so that you can install the dependencies in isolation.\n", 51 | "\n", 52 | "1. 
In Visual Studio Code, open the folder containing tutorial-rag.ipynb.\n", 53 | "\n", 54 | "1. Press Ctrl-shift-P to open the command palette, search for \"Python: Create Environment\", and then select `Venv` to create a virtual environment in the current workspace.\n", 55 | "\n", 56 | "1. Select Tutorial-RAG\\tutorial-rag-requirements.txt for the dependencies.\n", 57 | "\n", 58 | "It takes several minutes to create the environment. When the environment is ready, continue to the next step." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Install packages" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "! pip install python-dotenv\n", 75 | "! pip install azure-core\n", 76 | "! pip install azure-search-documents\n", 77 | "! pip install azure-storage-blob\n", 78 | "! pip install azure-identity\n", 79 | "! pip install openai\n", 80 | "! pip install aiohttp" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Create an index" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 26, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Set variables\n", 97 | "AZURE_SEARCH_SERVICE: str = \"\"\n", 98 | "AZURE_KEY_VAULT_NAME: str = \"\"\n", 99 | "AZURE_KEY_VAULT_URI: str = \"\"\n", 100 | "AZURE_KEY_VAULT_VERSION: str = \"\"" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 24, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | " test-cmk-index-qs created\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "from azure.search.documents.indexes import SearchIndexClient\n", 118 | "from azure.search.documents.indexes.models import (\n", 119 | " SimpleField,\n", 120 | " SearchFieldDataType,\n", 121 | " SearchableField,\n", 122 | " SearchIndex,\n", 123 | " 
SearchResourceEncryptionKey\n", 124 | ")\n", 125 | "from azure.identity import DefaultAzureCredential\n", 126 | "\n", 127 | "credential = DefaultAzureCredential()\n", 128 | "\n", 129 | "# Create a search index \n", 130 | "index_name = \"test-cmk-index\"\n", 131 | "index_client = SearchIndexClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential) \n", 132 | "fields = [\n", 133 | " SimpleField(name=\"Id\", type=SearchFieldDataType.String, key=True),\n", 134 | " SearchableField(name=\"Description\", type=SearchFieldDataType.String)\n", 135 | " ]\n", 136 | "\n", 137 | "scoring_profiles = []\n", 138 | "suggester = []\n", 139 | "encryption_key = SearchResourceEncryptionKey(\n", 140 | " key_name=AZURE_KEY_VAULT_NAME,\n", 141 | " key_version=AZURE_KEY_VAULT_VERSION,\n", 142 | " vault_uri=AZURE_KEY_VAULT_URI\n", 143 | ")\n", 144 | "\n", 145 | "# Create the search index=\n", 146 | "index = SearchIndex(name=index_name, fields=fields, encryption_key=encryption_key)\n", 147 | "result = index_client.create_or_update_index(index)\n", 148 | "print(f' {result.name} created')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Get the index definition" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "index_name = \"test-cmk-index\"\n", 165 | "index_client = SearchIndexClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential) \n", 166 | "\n", 167 | "result = index_client.get_index(index_name) \n", 168 | "print(f\"{result}\") " 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## Load the index" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 29, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Upload of new document succeeded: True\n" 188 | ] 189 | } 190 | ], 191 | 
"source": [ 192 | "from azure.search.documents import SearchClient\n", 193 | "\n", 194 | "# Create a documents payload\n", 195 | "documents = [\n", 196 | " {\n", 197 | " \"@search.action\": \"upload\",\n", 198 | " \"Id\": \"1\",\n", 199 | " \"Description\": \"The hotel is ideally located on the main commercial artery of the city in the heart of New York. A few minutes away is Time's Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\"\n", 200 | " },\n", 201 | " {\n", 202 | " \"@search.action\": \"upload\",\n", 203 | " \"Id\": \"2\",\n", 204 | " \"Description\": \"The hotel is situated in a nineteenth century plaza, which has been expanded and renovated to the highest architectural standards to create a modern, functional and first-class hotel in which art and unique historical elements coexist with the most modern comforts.\"\n", 205 | " },\n", 206 | " {\n", 207 | " \"@search.action\": \"upload\",\n", 208 | " \"Id\": \"3\",\n", 209 | " \"Description\": \"The hotel stands out for its gastronomic excellence under the management of William Dough, who advises on and oversees all of the Hotel's restaurant services.\"\n", 210 | " },\n", 211 | " {\n", 212 | " \"@search.action\": \"upload\",\n", 213 | " \"Id\": \"4\",\n", 214 | " \"Description\": \"The hotel is located in the heart of the historic center of Sublime in an extremely vibrant and lively area within short walking distance to the sites and landmarks of the city and is surrounded by the extraordinary beauty of churches, buildings, shops and monuments. 
Sublime Cliff is part of a lovingly restored 1800 palace.\"\n", 215 | " }\n", 216 | "]\n", 217 | "\n", 218 | "search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE, index_name=index_name, credential=credential)\n", 219 | "try:\n", 220 | " result = search_client.upload_documents(documents=documents)\n", 221 | " print(\"Upload of new document succeeded: {}\".format(result[0].succeeded))\n", 222 | "except Exception as ex:\n", 223 | " print (ex.message)\n", 224 | "\n", 225 | " index_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Query encrypted content" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 31, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Score: 0.6130029\n", 245 | "Id: 4\n", 246 | "Description: The hotel is located in the heart of the historic center of Sublime in an extremely vibrant and lively area within short walking distance to the sites and landmarks of the city and is surrounded by the extraordinary beauty of churches, buildings, shops and monuments. Sublime Cliff is part of a lovingly restored 1800 palace.\n", 247 | "Score: 0.26286605\n", 248 | "Id: 1\n", 249 | "Description: The hotel is ideally located on the main commercial artery of the city in the heart of New York. 
A few minutes away is Time's Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "from azure.search.documents import SearchClient\n", 255 | "\n", 256 | "query = \"historic\" \n", 257 | "\n", 258 | "search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE, credential=credential, index_name=index_name)\n", 259 | " \n", 260 | "results = search_client.search( \n", 261 | " query_type='simple',\n", 262 | " search_text=query, \n", 263 | " select=[\"Id\", \"Description\"],\n", 264 | " include_total_count=True\n", 265 | " )\n", 266 | " \n", 267 | "for result in results: \n", 268 | " print(f\"Score: {result['@search.score']}\")\n", 269 | " print(f\"Id: {result['Id']}\")\n", 270 | " print(f\"Description: {result['Description']}\")\n" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": ".venv", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.11.9" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /Quickstart-Document-Permissions-Pull-API/document-permissions-pull-api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "aba4346f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Document level access in Azure AI Search using the indexer pull APIs\n", 9 | "\n", 10 | "In Azure AI Search, you can use an indexer to pull content into a search index for indexing. 
This notebook shows you how to index blobs that have access control lists (ACLs) in Azure Data Lake Storage (ADLS) Gen2, and then query the index to return only those results that the user is authorized to view.\n", 11 | "\n", 12 | "The security principal behind the query access token determines the \"user\". The ACLs on folders and files determine whether the user is authorized to view the content, and that metadata is pulled into the index along with other document content. Internally at query time, the search engine filters out any documents that aren't associated with the object ID.\n", 13 | "\n", 14 | "This feature is currently in preview.\n", 15 | "\n", 16 | "For an alternative approach that uses push APIs to index any data, see [Quickstart-Document-Permissions-Push-API](../Quickstart-Document-Permissions-Push-API/document-permissions-push-api.ipynb).\n", 17 | "\n", 18 | "\n", 19 | "## Prerequisites\n", 20 | "\n", 21 | "+ Azure AI Search, Basic tier or higher, with a [system-assigned managed identity](https://learn.microsoft.com/azure/search/search-howto-managed-identities-data-sources) and [role-based access control](https://learn.microsoft.com/azure/search/search-security-enable-roles).\n", 22 | "\n", 23 | "+ Azure Storage, general purpose account, with a [hierarchical namespace](https://learn.microsoft.com/azure/storage/blobs/create-data-lake-storage-account).\n", 24 | "\n", 25 | "+ Folders and files, where each file has an [access control list specified](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). 
We recommend group IDs.\n", 26 | "\n", 27 | "+ [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/).\n", 28 | "\n", 29 | "## Permissions\n", 30 | "\n", 31 | "This walkthrough uses Microsoft Entra ID authentication and authorization.\n", 32 | "\n", 33 | "+ On Azure Storage, **Storage Blob Data Reader** permissions are required for both the search service identity and for your user account since you are testing locally. You also need **Storage Blob Data Contributor** because the sample includes code for creating and configuring a container and its contents.\n", 34 | "\n", 35 | "+ On Azure AI Search, assign yourself **Search Service Contributor**, **Search Index Data Contributor**, and **Search Index Data Reader** permissions to create objects and run queries. For more information, see [Connect to Azure AI Search using roles](https://learn.microsoft.com/azure/search/search-security-rbac) and [Quickstart: Connect without keys for local testing](https://learn.microsoft.com/azure/search/search-get-started-rbac).\n", 36 | "\n", 37 | "## Limitations\n", 38 | "\n", 39 | "+ Parsing indexer options aren't currently supported. There's no support for CSV, JSON, or Markdown parsing." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "f445040a", 45 | "metadata": {}, 46 | "source": [ 47 | "## Set up connections\n", 48 | "\n", 49 | "Save the `sample.env` file as `.env` and then modify the environment variables to use your Azure endpoints. 
All variables must be specified.\n", 50 | "\n", 51 | "You need endpoints for:\n", 52 | "\n", 53 | "+ Azure AI Search\n", 54 | "+ Azure Storage\n", 55 | "\n", 56 | "For Azure AI Search, find the endpoint in the [Azure portal](https://portal.azure.com), in the **Essentials** section of the Overview page.\n", 57 | "\n", 58 | "For Azure Storage, follow the guidance in [Get storage account configuration information](https://learn.microsoft.com/azure/storage/common/storage-account-get-info) to specify all of the variables in the `.env` file. \n", 59 | "\n", 60 | "## Load connections\n", 61 | "\n", 62 | "We recommend creating a virtual environment to run this sample code. In Visual Studio Code, open the Command Palette (Ctrl+Shift+P) to create an environment. This notebook was tested on Python 3.10.\n", 63 | "\n", 64 | "After the environment is created, load the environment variables to set up connections and object names." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "0b40bb5b", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from dotenv import load_dotenv\n", 75 | "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", 76 | "import os\n", 77 | "\n", 78 | "load_dotenv(override=True) # take environment variables from .env.\n", 79 | "\n", 80 | "# The following variables from your .env file are used in this notebook\n", 81 | "endpoint = os.environ[\"AZURE_SEARCH_ENDPOINT\"]\n", 82 | "credential = DefaultAzureCredential()\n", 83 | "index_name = os.getenv(\"AZURE_SEARCH_INDEX\", \"document-permissions-indexer-idx\")\n", 84 | "indexer_name = os.getenv(\"AZURE_SEARCH_INDEXER\", \"document-permissions-indexer-idxr\")\n", 85 | "datasource_name = os.getenv(\"AZURE_SEARCH_DATASOURCE\", \"document-permissions-indexer-ds\")\n", 86 | "adls_gen2_account_name = os.getenv(\"AZURE_STORAGE_ACCOUNT_NAME\")\n", 87 | "adls_gen2_container_name = os.getenv(\"AZURE_STORAGE_CONTAINER_NAME\")\n", 88 | 
"adls_gen2_connection_string = os.environ[\"AZURE_STORAGE_CONNECTION_STRING\"]\n", 89 | "adls_gen2_resource_id = os.environ[\"AZURE_STORAGE_RESOURCE_ID\"]\n", 90 | "token_provider = get_bearer_token_provider(credential, \"https://search.azure.com/.default\")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "2d46b940", 96 | "metadata": {}, 97 | "source": [ 98 | "## Create an index\n", 99 | "\n", 100 | "The search index must include fields for your content and for permission metadata. Assign the new permission filter option to a string field and make sure the field is filterable. The search engine builds the filter internally at query time." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "2f981cad", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from azure.search.documents.indexes.models import SearchField, SearchIndex, PermissionFilter, SearchIndexPermissionFilterOption\n", 111 | "from azure.search.documents.indexes import SearchIndexClient\n", 112 | "\n", 113 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 114 | "index = SearchIndex(\n", 115 | " name=index_name,\n", 116 | " fields=[\n", 117 | " SearchField(name=\"id\", type=\"Edm.String\", key=True, filterable=True, sortable=True),\n", 118 | " SearchField(name=\"content\", type=\"Edm.String\", searchable=True, filterable=False, sortable=False),\n", 119 | " SearchField(name=\"oids\", type=\"Collection(Edm.String)\", filterable=True, permission_filter=PermissionFilter.USER_IDS),\n", 120 | " SearchField(name=\"groups\", type=\"Collection(Edm.String)\", filterable=True, permission_filter=PermissionFilter.GROUP_IDS),\n", 121 | " SearchField(name=\"metadata_storage_path\", type=\"Edm.String\", searchable=True),\n", 122 | " SearchField(name=\"metadata_storage_name\", type=\"Edm.String\", searchable=True)\n", 123 | " ],\n", 124 | " permission_filter_option=SearchIndexPermissionFilterOption.ENABLED\n", 125 | 
")\n", 126 | "\n", 127 | "index_client.create_or_update_index(index=index)\n", 128 | "print(f\"Index '{index_name}' created with permission filter option enabled.\")" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "2b8945a2", 134 | "metadata": {}, 135 | "source": [ 136 | "## Create a data source\n", 137 | "\n", 138 | "Set the `IndexerPermissionOption` so that the indexer knows to retrieve the permission metadata." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "b25aaf7b", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from azure.search.documents.indexes.models import SearchIndexerDataSourceConnection, SearchIndexerDataSourceType, IndexerPermissionOption, SearchIndexerDataContainer, DataSourceCredentials\n", 149 | "from azure.search.documents.indexes import SearchIndexerClient\n", 150 | "indexer_client = SearchIndexerClient(endpoint=endpoint, credential=credential)\n", 151 | "datasource = SearchIndexerDataSourceConnection(\n", 152 | " name=datasource_name,\n", 153 | " type=SearchIndexerDataSourceType.ADLS_GEN2,\n", 154 | " connection_string=f\"ResourceId={adls_gen2_resource_id};\",\n", 155 | " container=SearchIndexerDataContainer(name=adls_gen2_container_name),\n", 156 | " indexer_permission_options=[IndexerPermissionOption.GROUP_IDS]\n", 157 | ")\n", 158 | "\n", 159 | "indexer_client.create_or_update_data_source_connection(datasource)\n", 160 | "print(f\"Datasource '{datasource_name}' created with permission filter option enabled.\")" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "ff5b912d", 166 | "metadata": {}, 167 | "source": [ 168 | "## Get group IDs\n", 169 | "\n", 170 | "This step calls the Graph APIs to get a few group IDs for your Microsoft Entra identity. Your group IDs will be added to the access control list of the objects created in the next step. Two group identifiers are retrieved. Each one is assigned to a different file." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "329fe160", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "from msgraph import GraphServiceClient\n", 181 | "client = GraphServiceClient(credentials=credential, scopes=[\"https://graph.microsoft.com/.default\"])\n", 182 | "\n", 183 | "groups = await client.me.member_of.get()\n", 184 | "first_group_id = groups.value[0].id\n", 185 | "second_group_id = groups.value[1].id" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "20588dc3", 191 | "metadata": {}, 192 | "source": [ 193 | "## Upload sample directory and file\n", 194 | "\n", 195 | "This step creates the container, folders, and uploads the files into Azure Storage. It assigns your group IDs to the access control list for each file." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "id": "acd28b29", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "from azure.storage.filedatalake import DataLakeServiceClient\n", 206 | "import requests\n", 207 | "\n", 208 | "service = DataLakeServiceClient.from_connection_string(adls_gen2_connection_string, credential=credential)\n", 209 | "container = service.get_file_system_client(adls_gen2_container_name)\n", 210 | "if not container.exists():\n", 211 | " container.create_file_system()\n", 212 | "root_dir_client = container.get_directory_client(\"/\")\n", 213 | "state_parks_dir_client = container.get_directory_client(\"state-parks\")\n", 214 | "state_parks_dir_client.create_directory()\n", 215 | "root_dir_client.update_access_control_recursive(f\"group:{first_group_id}:rwx\")\n", 216 | "root_dir_client.update_access_control_recursive(f\"group:{second_group_id}:rwx\")\n", 217 | "\n", 218 | "oregon_dir_client = state_parks_dir_client.create_sub_directory(\"oregon\")\n", 219 | "oregon_dir_client.create_directory()\n", 220 | "file_client = 
oregon_dir_client.create_file(\"oregon_state_parks.csv\")\n", 221 | "oregon_state_parks_content = requests.get(\"https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/state-parks/Oregon/oregon_state_parks.csv\").content.decode(\"utf-8\")\n", 222 | "file_client.upload_data(oregon_state_parks_content, overwrite=True)\n", 223 | "oregon_dir_client.update_access_control_recursive(f\"group:{first_group_id}:rwx\")\n", 224 | "\n", 225 | "washington_dir_client = state_parks_dir_client.create_sub_directory(\"washington\")\n", 226 | "washington_dir_client.create_directory()\n", 227 | "file_client = washington_dir_client.create_file(\"washington_state_parks.csv\")\n", 228 | "washington_state_parks_content = requests.get(\"https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/state-parks/Washington/washington_state_parks.csv\").content.decode(\"utf-8\")\n", 229 | "file_client.upload_data(washington_state_parks_content, overwrite=True)\n", 230 | "washington_dir_client.update_access_control_recursive(f\"group:{second_group_id}:rwx\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "id": "ca6de2ad", 236 | "metadata": {}, 237 | "source": [ 238 | "## Run the indexer\n", 239 | "\n", 240 | "Start the indexer to run all operations, from data retrieval to indexing. Any connection errors or permission problems become evident here." 
241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "2ce7eb5e", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "from azure.search.documents.indexes.models import SearchIndexer, FieldMapping\n", 251 | "\n", 252 | "indexer = SearchIndexer(\n", 253 | " name=indexer_name,\n", 254 | " target_index_name=index_name,\n", 255 | " data_source_name=datasource_name,\n", 256 | " field_mappings=[\n", 257 | " FieldMapping(source_field_name=\"metadata_group_ids\", target_field_name=\"groups\"),\n", 258 | " FieldMapping(source_field_name=\"metadata_user_ids\", target_field_name=\"oids\"),\n", 259 | " ]\n", 260 | ")\n", 261 | "\n", 262 | "indexer_client.create_or_update_indexer(indexer)\n", 263 | "print(f\"Indexer '{indexer_name}' created\")\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "987dd496", 269 | "metadata": {}, 270 | "source": [ 271 | "## Search sample data using x-ms-query-source-authorization\n", 272 | "\n", 273 | "Wait for the indexer to finish processing before running queries. This query uses an empty search string (`*`) for an unqualified search. It returns the file name and permission metadata associated with each file. Notice that each file is associated with a different group ID." 
274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "7a899da1", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "from azure.search.documents import SearchClient\n", 284 | "search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)\n", 285 | "\n", 286 | "results = search_client.search(search_text=\"*\", x_ms_query_source_authorization=token_provider(), select=\"metadata_storage_path,oids,groups\", order_by=\"id asc\")\n", 287 | "for result in results:\n", 288 | " print(f\"Path: {result['metadata_storage_path']}, OID: {result['oids']}, Group: {result['groups']}\")" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "id": "c712ab8c", 294 | "metadata": {}, 295 | "source": [ 296 | "## Search sample data without x-ms-query-source-authorization \n", 297 | "\n", 298 | "This step demonstrates the user experience when authorization fails. No results are returned in the response." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "id": "72d203f0", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "from azure.search.documents import SearchClient\n", 309 | "search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)\n", 310 | "\n", 311 | "results = search_client.search(search_text=\"*\", x_ms_query_source_authorization=None, select=\"metadata_storage_path,oids,groups\", order_by=\"id asc\")\n", 312 | "for result in results:\n", 313 | " print(f\"Path: {result['metadata_storage_path']}, OID: {result['oids']}, Group: {result['groups']}\")" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "id": "e1ac3c84", 319 | "metadata": {}, 320 | "source": [ 321 | "## Next step\n", 322 | "\n", 323 | "To learn more, see [Document-level access control in Azure AI Search](https://learn.microsoft.com/azure/search/search-document-level-access-overview)." 
324 | ] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": ".venv", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.12.10" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 5 348 | } 349 | -------------------------------------------------------------------------------- /Quickstart/azure-search-quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Create a search index in Azure AI Search using the Azure SDK for Python" 9 | ] 10 | }, 11 | { 12 | "attachments": {}, 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook steps through creating, loading, and querying an index in Azure AI Search index by calling the azure-search-documents library in the Azure SDK for Python. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Install packages and set variables" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "! pip install azure-search-documents==11.7.0b2 --quiet\n", 33 | "! pip install azure-identity --quiet\n", 34 | "! 
pip install python-dotenv --quiet" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Provide variables\n", 44 | "search_endpoint: str = \"PUT-YOUR-SEARCH-ENDPOINT-HERE\"\n", 45 | "search_api_key: str = \"PUT-YOUR-SEARCH-API-KEY-HERE\"\n", 46 | "index_name: str = \"hotels-quickstart-csharp\"" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Create an index" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from azure.core.credentials import AzureKeyCredential\n", 63 | "\n", 64 | "credential = AzureKeyCredential(search_api_key)\n", 65 | "\n", 66 | "from azure.search.documents.indexes import SearchIndexClient\n", 67 | "from azure.search.documents import SearchClient\n", 68 | "from azure.search.documents.indexes.models import (\n", 69 | " ComplexField,\n", 70 | " SimpleField,\n", 71 | " SearchFieldDataType,\n", 72 | " SearchableField,\n", 73 | " SearchIndex\n", 74 | ")\n", 75 | "\n", 76 | "# Create a search schema\n", 77 | "index_client = SearchIndexClient(\n", 78 | " endpoint=search_endpoint, credential=credential)\n", 79 | "fields = [\n", 80 | " SimpleField(name=\"HotelId\", type=SearchFieldDataType.String, key=True),\n", 81 | " SearchableField(name=\"HotelName\", type=SearchFieldDataType.String, sortable=True),\n", 82 | " SearchableField(name=\"Description\", type=SearchFieldDataType.String, analyzer_name=\"en.lucene\"),\n", 83 | " SearchableField(name=\"Category\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", 84 | "\n", 85 | " SearchableField(name=\"Tags\", collection=True, type=SearchFieldDataType.String, facetable=True, filterable=True),\n", 86 | "\n", 87 | " SimpleField(name=\"ParkingIncluded\", type=SearchFieldDataType.Boolean, facetable=True, filterable=True, sortable=True),\n", 88 | " 
SimpleField(name=\"LastRenovationDate\", type=SearchFieldDataType.DateTimeOffset, facetable=True, filterable=True, sortable=True),\n", 89 | " SimpleField(name=\"Rating\", type=SearchFieldDataType.Double, facetable=True, filterable=True, sortable=True),\n", 90 | "\n", 91 | " ComplexField(name=\"Address\", fields=[\n", 92 | " SearchableField(name=\"StreetAddress\", type=SearchFieldDataType.String),\n", 93 | " SearchableField(name=\"City\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", 94 | " SearchableField(name=\"StateProvince\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", 95 | " SearchableField(name=\"PostalCode\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", 96 | " SearchableField(name=\"Country\", type=SearchFieldDataType.String, facetable=True, filterable=True, sortable=True),\n", 97 | " ])\n", 98 | " ]\n", 99 | "\n", 100 | "scoring_profiles = []\n", 101 | "suggester = [{'name': 'sg', 'source_fields': ['Tags', 'Address/City', 'Address/Country']}]\n", 102 | "\n", 103 | "# Create the search index=\n", 104 | "index = SearchIndex(name=index_name, fields=fields, suggesters=suggester, scoring_profiles=scoring_profiles)\n", 105 | "result = index_client.create_or_update_index(index)\n", 106 | "print(f' {result.name} created')" 107 | ] 108 | }, 109 | { 110 | "attachments": {}, 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Create a documents payload" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Create a documents payload\n", 124 | "documents = [\n", 125 | " {\n", 126 | " \"@search.action\": \"upload\",\n", 127 | " \"HotelId\": \"1\",\n", 128 | " \"HotelName\": \"Stay-Kay City Hotel\",\n", 129 | " \"Description\": \"This classic hotel is fully-refurbished and ideally located on the main commercial artery of the 
city in the heart of New York. A few minutes away is Times Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\",\n", 130 | " \"Category\": \"Boutique\",\n", 131 | " \"Tags\": [ \"view\", \"air conditioning\", \"concierge\" ],\n", 132 | " \"ParkingIncluded\": \"false\",\n", 133 | " \"LastRenovationDate\": \"2020-01-18T00:00:00Z\",\n", 134 | " \"Rating\": 3.60,\n", 135 | " \"Address\": {\n", 136 | " \"StreetAddress\": \"677 5th Ave\",\n", 137 | " \"City\": \"New York\",\n", 138 | " \"StateProvince\": \"NY\",\n", 139 | " \"PostalCode\": \"10022\",\n", 140 | " \"Country\": \"USA\"\n", 141 | " }\n", 142 | " },\n", 143 | " {\n", 144 | " \"@search.action\": \"upload\",\n", 145 | " \"HotelId\": \"2\",\n", 146 | " \"HotelName\": \"Old Century Hotel\",\n", 147 | " \"Description\": \"The hotel is situated in a nineteenth century plaza, which has been expanded and renovated to the highest architectural standards to create a modern, functional and first-class hotel in which art and unique historical elements coexist with the most modern comforts. 
The hotel also regularly hosts events like wine tastings, beer dinners, and live music.\",\n", 148 | " \"Category\": \"Boutique\",\n", 149 | " \"Tags\": [ \"pool\", \"free wifi\", \"concierge\" ],\n", 150 | " \"ParkingIncluded\": \"false\",\n", 151 | " \"LastRenovationDate\": \"2019-02-18T00:00:00Z\",\n", 152 | " \"Rating\": 3.60,\n", 153 | " \"Address\": {\n", 154 | " \"StreetAddress\": \"140 University Town Center Dr\",\n", 155 | " \"City\": \"Sarasota\",\n", 156 | " \"StateProvince\": \"FL\",\n", 157 | " \"PostalCode\": \"34243\",\n", 158 | " \"Country\": \"USA\"\n", 159 | " }\n", 160 | " },\n", 161 | " {\n", 162 | " \"@search.action\": \"upload\",\n", 163 | " \"HotelId\": \"3\",\n", 164 | " \"HotelName\": \"Gastronomic Landscape Hotel\",\n", 165 | " \"Description\": \"The Gastronomic Hotel stands out for its culinary excellence under the management of William Dough, who advises on and oversees all of the Hotel’s restaurant services.\",\n", 166 | " \"Category\": \"Suite\",\n", 167 | " \"Tags\": [ \"restaurant\", \"bar\", \"continental breakfast\" ],\n", 168 | " \"ParkingIncluded\": \"true\",\n", 169 | " \"LastRenovationDate\": \"2015-09-20T00:00:00Z\",\n", 170 | " \"Rating\": 4.80,\n", 171 | " \"Address\": {\n", 172 | " \"StreetAddress\": \"3393 Peachtree Rd\",\n", 173 | " \"City\": \"Atlanta\",\n", 174 | " \"StateProvince\": \"GA\",\n", 175 | " \"PostalCode\": \"30326\",\n", 176 | " \"Country\": \"USA\"\n", 177 | " }\n", 178 | " },\n", 179 | " {\n", 180 | " \"@search.action\": \"upload\",\n", 181 | " \"HotelId\": \"4\",\n", 182 | " \"HotelName\": \"Sublime Palace Hotel\",\n", 183 | " \"Description\": \"Sublime Palace Hotel is located in the heart of the historic center of Sublime in an extremely vibrant and lively area within short walking distance to the sites and landmarks of the city and is surrounded by the extraordinary beauty of churches, buildings, shops and monuments. 
Sublime Cliff is part of a lovingly restored 19th century resort, updated for every modern convenience.\",\n", 184 | " \"Category\": \"Boutique\",\n", 185 | " \"Tags\": [ \"concierge\", \"view\", \"air conditioning\" ],\n", 186 | " \"ParkingIncluded\": \"true\",\n", 187 | " \"LastRenovationDate\": \"2020-02-06T00:00:00Z\",\n", 188 | " \"Rating\": 4.60,\n", 189 | " \"Address\": {\n", 190 | " \"StreetAddress\": \"7400 San Pedro Ave\",\n", 191 | " \"City\": \"San Antonio\",\n", 192 | " \"StateProvince\": \"TX\",\n", 193 | " \"PostalCode\": \"78216\",\n", 194 | " \"Country\": \"USA\"\n", 195 | " }\n", 196 | " }\n", 197 | "]" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Upload documents" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "search_client = SearchClient(endpoint=search_endpoint,\n", 214 | " index_name=index_name,\n", 215 | " credential=credential)\n", 216 | "try:\n", 217 | " result = search_client.upload_documents(documents=documents)\n", 218 | " print(\"Upload of new document succeeded: {}\".format(result[0].succeeded))\n", 219 | "except Exception as ex:\n", 220 | " print (ex.message)\n", 221 | "\n", 222 | " index_client = SearchIndexClient(\n", 223 | " endpoint=search_endpoint, credential=credential)" 224 | ] 225 | }, 226 | { 227 | "attachments": {}, 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Run your first query" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Run an empty query (returns selected fields, all documents)\n", 241 | "results = search_client.search(query_type='simple',\n", 242 | " search_text=\"*\" ,\n", 243 | " select='HotelName,Description,Tags',\n", 244 | " include_total_count=True)\n", 245 | "\n", 246 | "print ('Total Documents Matching 
Query:', results.get_count())\n", 247 | "for result in results:\n", 248 | " print(result[\"@search.score\"])\n", 249 | " print(result[\"HotelName\"])\n", 250 | " print(result[\"Tags\"])\n", 251 | " print(f\"Description: {result['Description']}\")\n" 252 | ] 253 | }, 254 | { 255 | "attachments": {}, 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## Run a term query" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Run a term query\n", 269 | "results = search_client.search(query_type='simple',\n", 270 | " search_text=\"wifi\" ,\n", 271 | " select='HotelName,Description,Tags',\n", 272 | " include_total_count=True)\n", 273 | "\n", 274 | "print ('Total Documents Matching Query:', results.get_count())\n", 275 | "for result in results:\n", 276 | " print(result[\"@search.score\"])\n", 277 | " print(result[\"HotelName\"])\n", 278 | " print(f\"Description: {result['Description']}\")" 279 | ] 280 | }, 281 | { 282 | "attachments": {}, 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Add a filter" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# Add a filter\n", 296 | "results = search_client.search(\n", 297 | " search_text=\"hotels\", \n", 298 | " select='HotelId,HotelName,Rating', \n", 299 | " filter='Rating gt 4', \n", 300 | " order_by='Rating desc')\n", 301 | "\n", 302 | "for result in results:\n", 303 | " print(\"{}: {} - {} rating\".format(result[\"HotelId\"], result[\"HotelName\"], result[\"Rating\"]))" 304 | ] 305 | }, 306 | { 307 | "attachments": {}, 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Scope a query to specific searchable fields" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | 
"source": [ 320 | "results = search_client.search(\n", 321 | " search_text=\"sublime\", \n", 322 | " search_fields=['HotelName'], \n", 323 | " select='HotelId,HotelName')\n", 324 | "\n", 325 | "for result in results:\n", 326 | " print(\"{}: {}\".format(result[\"HotelId\"], result[\"HotelName\"]))" 327 | ] 328 | }, 329 | { 330 | "attachments": {}, 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Return facets" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Return facets\n", 344 | "results = search_client.search(search_text=\"*\", facets=[\"Category\"])\n", 345 | "\n", 346 | "facets = results.get_facets()\n", 347 | "\n", 348 | "for facet in facets[\"Category\"]:\n", 349 | " print(\" {}\".format(facet))" 350 | ] 351 | }, 352 | { 353 | "attachments": {}, 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Look up a document " 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "# Look up a specific document by ID\n", 367 | "result = search_client.get_document(key=\"3\")\n", 368 | "\n", 369 | "print(\"Details for hotel '3' are:\")\n", 370 | "print(\"Name: {}\".format(result[\"HotelName\"]))\n", 371 | "print(\"Rating: {}\".format(result[\"Rating\"]))\n", 372 | "print(\"Category: {}\".format(result[\"Category\"]))" 373 | ] 374 | }, 375 | { 376 | "attachments": {}, 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## Autocomplete a query" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "# Autocomplete a query\n", 390 | "search_suggestion = 'sa'\n", 391 | "results = search_client.autocomplete(\n", 392 | " search_text=search_suggestion, \n", 393 | " suggester_name=\"sg\",\n", 394 | " 
mode='twoTerms')\n", 395 | "\n", 396 | "print(\"Autocomplete for:\", search_suggestion)\n", 397 | "for result in results:\n", 398 | " print (result['text'])" 399 | ] 400 | }, 401 | { 402 | "attachments": {}, 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Clean up\n", 407 | "\n", 408 | "If you are finished with this index, you can delete it by running the following lines. Deleting unnecessary indexes frees up space for stepping through more quickstarts and tutorials." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "try:\n", 418 | " result = index_client.delete_index(index_name)\n", 419 | " print ('Index', index_name, 'Deleted')\n", 420 | "except Exception as ex:\n", 421 | " print (ex)" 422 | ] 423 | }, 424 | { 425 | "attachments": {}, 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "Confirm the index deletion by running the following script that lists all of the indexes on your search service. If hotels-quickstart is not listed, you've successfully deleted the index and have completed this quickstart." 
430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "try:\n", 439 | " result = index_client.get_index(index_name)\n", 440 | " print (result)\n", 441 | "except Exception as ex:\n", 442 | " print (ex)\n" 443 | ] 444 | } 445 | ], 446 | "metadata": { 447 | "kernelspec": { 448 | "display_name": ".venv", 449 | "language": "python", 450 | "name": "python3" 451 | }, 452 | "language_info": { 453 | "codemirror_mode": { 454 | "name": "ipython", 455 | "version": 3 456 | }, 457 | "file_extension": ".py", 458 | "mimetype": "text/x-python", 459 | "name": "python", 460 | "nbconvert_exporter": "python", 461 | "pygments_lexer": "ipython3", 462 | "version": "3.10.12" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 2 467 | } 468 | -------------------------------------------------------------------------------- /Quickstart-Agentic-Retrieval/quickstart-agentic-retrieval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4be5d807", 6 | "metadata": {}, 7 | "source": [ 8 | "# Quickstart: Agentic retrieval in Azure AI Search\n", 9 | "\n", 10 | "Use this notebook to get started with [agentic retrieval](https://learn.microsoft.com/azure/search/search-agentic-retrieval-concept) in Azure AI Search, which integrates an LLM from Azure OpenAI in Foundry Models to process queries, retrieve relevant content from indexed documents, and generate natural-language answers.\n", 11 | "\n", 12 | "In this notebook, you:\n", 13 | "\n", 14 | "1. Create and load an `earth-at-night` search index.\n", 15 | "\n", 16 | "1. Create an `earth-knowledge-source` that targets your index.\n", 17 | "\n", 18 | "1. Create an `earth-knowledge-base` that targets your knowledge source and an LLM for query planning and answer synthesis.\n", 19 | "\n", 20 | "1. 
Use the knowledge base to fetch, rank, and synthesize relevant information from the index.\n", 21 | "\n", 22 | "1. Run an evaluation to assess the groundedness and relevance of the pipeline.\n", 23 | "\n", 24 | "This notebook provides a high-level demonstration of agentic retrieval. For more detailed guidance, see [Quickstart: Use agentic retrieval in Azure AI Search](https://learn.microsoft.com/azure/search/search-get-started-agentic-retrieval)." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "6712b97d", 30 | "metadata": {}, 31 | "source": [ 32 | "## Prerequisites\n", 33 | "\n", 34 | "+ An [Azure AI Search service](https://learn.microsoft.com/azure/search/search-create-service-portal) in any [region that provides agentic retrieval](https://learn.microsoft.com/azure/search/search-region-support).\n", 35 | "\n", 36 | "+ A [Microsoft Foundry project](https://learn.microsoft.com/azure/ai-foundry/how-to/create-projects) and resource. When you create a project, the resource is automatically created.\n", 37 | "\n", 38 | "+ A [supported LLM](https://learn.microsoft.com/azure/search/agentic-retrieval-how-to-create-knowledge-base#supported-models). This sample uses `gpt-5-mini`.\n", 39 | "\n", 40 | "+ A text embedding model. This sample uses `text-embedding-3-large`.\n", 41 | "\n", 42 | "+ [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/)." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "3f5fbd46", 48 | "metadata": {}, 49 | "source": [ 50 | "## Configure access\n", 51 | "\n", 52 | "This notebook assumes that you're using Microsoft Entra ID for authentication and role assignments for authorization.\n", 53 | "\n", 54 | "To configure role-based access:\n", 55 | "\n", 56 | "1. Sign in to the [Azure portal](https://portal.azure.com).\n", 57 | "\n", 58 | "1. 
On your Azure AI Search service:\n", 59 | "\n", 60 | " 1. [Enable role-based access](https://learn.microsoft.com/azure/search/search-security-enable-roles).\n", 61 | " \n", 62 | " 1. [Create a system-assigned managed identity](https://learn.microsoft.com/azure/search/search-howto-managed-identities-data-sources#create-a-system-managed-identity).\n", 63 | " \n", 64 | " 1. [Assign the following roles](https://learn.microsoft.com/azure/search/search-security-rbac#how-to-assign-roles-in-the-azure-portal) to yourself.\n", 65 | " \n", 66 | " + **Search Service Contributor**\n", 67 | " \n", 68 | " + **Search Index Data Contributor**\n", 69 | " \n", 70 | " + **Search Index Data Reader**\n", 71 | "\n", 72 | "1. On your Microsoft Foundry resource, assign **Cognitive Services User** to the managed identity of your search service." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "733bf308", 78 | "metadata": {}, 79 | "source": [ 80 | "## Set up connections\n", 81 | "\n", 82 | "The `sample.env` file contains environment variables for connections to Azure AI Search and Azure OpenAI in Foundry Models. Agentic retrieval requires these connections for document retrieval, query planning, and query execution.\n", 83 | "\n", 84 | "To set up the connections:\n", 85 | "\n", 86 | "1. Sign in to the [Azure portal](https://portal.azure.com).\n", 87 | "\n", 88 | "1. Get the endpoints for Azure AI Search (`https://your-search-service.search.windows.net`) and Azure OpenAI in Foundry Models (`https://your-foundry-resource.openai.azure.com`).\n", 89 | "\n", 90 | "1. Save the `sample.env` file as `.env` on your local system.\n", 91 | "\n", 92 | "1. Update the `.env` file with the retrieved endpoints." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "34a54a0f", 98 | "metadata": {}, 99 | "source": [ 100 | "## Create a virtual environment\n", 101 | "\n", 102 | "The `requirements.txt` file contains the dependencies for this notebook. 
You can use a virtual environment to install these dependencies in isolation.\n", 103 | "\n", 104 | "To create a virtual environment:\n", 105 | "\n", 106 | "1. In Visual Studio Code, open the folder that contains `quickstart-agentic-retrieval.ipynb`.\n", 107 | "\n", 108 | "1. Press **Ctrl**+**Shift**+**P** to open the command palette.\n", 109 | "\n", 110 | "1. Search for **Python: Create Environment**, and then select **Venv**.\n", 111 | "\n", 112 | "1. Select a Python installation. We tested this notebook on Python 3.13.7.\n", 113 | "\n", 114 | "1. Select `requirements.txt` for the dependencies.\n", 115 | "\n", 116 | "Creating the virtual environment can take several minutes. When the environment is ready, proceed to the next step." 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "0714a968", 122 | "metadata": {}, 123 | "source": [ 124 | "## Install packages and load connections\n", 125 | "\n", 126 | "This step installs the packages for this notebook and establishes connections to Azure AI Search and Azure OpenAI in Foundry Models." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "041e5d89", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "! 
pip install -r requirements.txt --quiet" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "2df3a118", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from dotenv import load_dotenv\n", 147 | "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", 148 | "import os\n", 149 | "\n", 150 | "# Take environment variables from .env\n", 151 | "load_dotenv(override=True)\n", 152 | "\n", 153 | "# This notebook uses the following variables from your .env file\n", 154 | "search_endpoint = os.environ[\"SEARCH_ENDPOINT\"]\n", 155 | "credential = DefaultAzureCredential()\n", 156 | "token_provider = get_bearer_token_provider(credential, \"https://search.azure.com/.default\")\n", 157 | "aoai_endpoint = os.environ[\"AOAI_ENDPOINT\"]\n", 158 | "aoai_embedding_model = os.environ.get(\"AOAI_EMBEDDING_MODEL\", \"text-embedding-3-large\")\n", 159 | "aoai_embedding_deployment = os.environ.get(\"AOAI_EMBEDDING_DEPLOYMENT\", \"text-embedding-3-large\")\n", 160 | "aoai_gpt_model = os.environ.get(\"AOAI_GPT_MODEL\", \"gpt-5-mini\")\n", 161 | "aoai_gpt_deployment = os.environ.get(\"AOAI_GPT_DEPLOYMENT\", \"gpt-5-mini\")\n", 162 | "index_name = os.environ.get(\"INDEX_NAME\", \"earth-at-night\")\n", 163 | "knowledge_source_name = os.environ.get(\"KNOWLEDGE_SOURCE_NAME\", \"earth-knowledge-source\")\n", 164 | "knowledge_base_name = os.environ.get(\"KNOWLEDGE_BASE_NAME\", \"earth-knowledge-base\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "58e8a088", 170 | "metadata": {}, 171 | "source": [ 172 | "## Create a search index\n", 173 | "\n", 174 | "This step creates an index that contains plain text and vector content. You can use an existing index, but it must meet the criteria for [agentic retrieval workloads](https://learn.microsoft.com/azure/search/search-agentic-retrieval-how-to-index). The primary schema requirement is a semantic configuration with a `default_configuration_name`." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "ee48bec5", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from azure.search.documents.indexes.models import SearchIndex, SearchField, VectorSearch, VectorSearchProfile, HnswAlgorithmConfiguration, AzureOpenAIVectorizer, AzureOpenAIVectorizerParameters, SemanticSearch, SemanticConfiguration, SemanticPrioritizedFields, SemanticField\n", 185 | "from azure.search.documents.indexes import SearchIndexClient\n", 186 | "from azure.identity import get_bearer_token_provider\n", 187 | "\n", 188 | "azure_openai_token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n", 189 | "index = SearchIndex(\n", 190 | " name=index_name,\n", 191 | " fields=[\n", 192 | " SearchField(name=\"id\", type=\"Edm.String\", key=True, filterable=True, sortable=True, facetable=True),\n", 193 | " SearchField(name=\"page_chunk\", type=\"Edm.String\", filterable=False, sortable=False, facetable=False),\n", 194 | " SearchField(name=\"page_embedding_text_3_large\", type=\"Collection(Edm.Single)\", stored=False, vector_search_dimensions=3072, vector_search_profile_name=\"hnsw_text_3_large\"),\n", 195 | " SearchField(name=\"page_number\", type=\"Edm.Int32\", filterable=True, sortable=True, facetable=True)\n", 196 | " ],\n", 197 | " vector_search=VectorSearch(\n", 198 | " profiles=[VectorSearchProfile(name=\"hnsw_text_3_large\", algorithm_configuration_name=\"alg\", vectorizer_name=\"azure_openai_text_3_large\")],\n", 199 | " algorithms=[HnswAlgorithmConfiguration(name=\"alg\")],\n", 200 | " vectorizers=[\n", 201 | " AzureOpenAIVectorizer(\n", 202 | " vectorizer_name=\"azure_openai_text_3_large\",\n", 203 | " parameters=AzureOpenAIVectorizerParameters(\n", 204 | " resource_url=aoai_endpoint,\n", 205 | " deployment_name=aoai_embedding_deployment,\n", 206 | " model_name=aoai_embedding_model\n", 207 | " )\n", 208 | " )\n", 209 | " ]\n", 210 | " 
),\n", 211 | " semantic_search=SemanticSearch(\n", 212 | " default_configuration_name=\"semantic_config\",\n", 213 | " configurations=[\n", 214 | " SemanticConfiguration(\n", 215 | " name=\"semantic_config\",\n", 216 | " prioritized_fields=SemanticPrioritizedFields(\n", 217 | " content_fields=[\n", 218 | " SemanticField(field_name=\"page_chunk\")\n", 219 | " ]\n", 220 | " )\n", 221 | " )\n", 222 | " ]\n", 223 | " )\n", 224 | ")\n", 225 | "\n", 226 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 227 | "index_client.create_or_update_index(index)\n", 228 | "print(f\"Index '{index_name}' created or updated successfully.\")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "39874f61", 234 | "metadata": {}, 235 | "source": [ 236 | "## Upload sample documents\n", 237 | "\n", 238 | "This notebook uses data from NASA's Earth at Night e-book. The data is retrieved from the [azure-search-sample-data](https://github.com/Azure-Samples/azure-search-sample-data) repository on GitHub and passed to the search client for indexing." 
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "ded5147b", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "import requests\n", 249 | "from azure.search.documents import SearchIndexingBufferedSender\n", 250 | "\n", 251 | "url = \"https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/nasa-e-book/earth-at-night-json/documents.json\"\n", 252 | "documents = requests.get(url).json()\n", 253 | "\n", 254 | "with SearchIndexingBufferedSender(endpoint=search_endpoint, index_name=index_name, credential=credential) as client:\n", 255 | " client.upload_documents(documents=documents)\n", 256 | "\n", 257 | "print(f\"Documents uploaded to index '{index_name}' successfully.\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "d0fb9e5f", 263 | "metadata": {}, 264 | "source": [ 265 | "## Create a knowledge source\n", 266 | "\n", 267 | "This step creates a knowledge source that targets the index you previously created. In the next step, you create a knowledge base that uses the knowledge source to orchestrate agentic retrieval." 
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "e3415954", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "from azure.search.documents.indexes.models import SearchIndexKnowledgeSource, SearchIndexKnowledgeSourceParameters, SearchIndexFieldReference\n", 278 | "from azure.search.documents.indexes import SearchIndexClient\n", 279 | "\n", 280 | "ks = SearchIndexKnowledgeSource(\n", 281 | " name=knowledge_source_name,\n", 282 | " description=\"Knowledge source for Earth at night data\",\n", 283 | " search_index_parameters=SearchIndexKnowledgeSourceParameters(\n", 284 | " search_index_name=index_name,\n", 285 | " source_data_fields=[SearchIndexFieldReference(name=\"id\"), SearchIndexFieldReference(name=\"page_number\")]\n", 286 | " ),\n", 287 | ")\n", 288 | "\n", 289 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 290 | "index_client.create_or_update_knowledge_source(knowledge_source=ks)\n", 291 | "print(f\"Knowledge source '{knowledge_source_name}' created or updated successfully.\")" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "5c5e0a34", 297 | "metadata": {}, 298 | "source": [ 299 | "## Create a knowledge base\n", 300 | "\n", 301 | "This step creates a knowledge base, which acts as a wrapper for your knowledge source and LLM deployment.\n", 302 | "\n", 303 | "`EXTRACTIVE_DATA` is the default modality and returns content from your knowledge sources without generative alteration. However, this quickstart uses the `ANSWER_SYNTHESIS` modality for LLM-generated answers that cite the retrieved content." 
304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "d3fe4183", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "from azure.search.documents.indexes.models import KnowledgeBase, KnowledgeBaseAzureOpenAIModel, KnowledgeSourceReference, AzureOpenAIVectorizerParameters, KnowledgeRetrievalOutputMode, KnowledgeRetrievalLowReasoningEffort\n", 314 | "from azure.search.documents.indexes import SearchIndexClient\n", 315 | "\n", 316 | "aoai_params = AzureOpenAIVectorizerParameters(\n", 317 | " resource_url=aoai_endpoint,\n", 318 | " deployment_name=aoai_gpt_deployment,\n", 319 | " model_name=aoai_gpt_model,\n", 320 | ")\n", 321 | "\n", 322 | "knowledge_base = KnowledgeBase(\n", 323 | " name=knowledge_base_name,\n", 324 | " models=[KnowledgeBaseAzureOpenAIModel(azure_open_ai_parameters=aoai_params)],\n", 325 | " knowledge_sources=[\n", 326 | " KnowledgeSourceReference(\n", 327 | " name=knowledge_source_name\n", 328 | " )\n", 329 | " ],\n", 330 | " output_mode=KnowledgeRetrievalOutputMode.ANSWER_SYNTHESIS,\n", 331 | " answer_instructions=\"Provide a 2 sentence concise and informative answer based on the retrieved documents.\"\n", 332 | ")\n", 333 | "\n", 334 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 335 | "index_client.create_or_update_knowledge_base(knowledge_base)\n", 336 | "print(f\"Knowledge base '{knowledge_base_name}' created or updated successfully.\")" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "cf7d8fbe", 342 | "metadata": {}, 343 | "source": [ 344 | "## Set up messages\n", 345 | "\n", 346 | "Messages are the input for the retrieval route and contain the conversation history. Each message includes a `role` that indicates its origin, such as `system` or `user`, and `content` in natural language. The LLM you use determines which roles are valid." 
347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "id": "357268fc", 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "instructions = \"\"\"\n", 357 | "A Q&A agent that can answer questions about the Earth at night.\n", 358 | "If you don't have the answer, respond with \"I don't know\".\n", 359 | "\"\"\"\n", 360 | "\n", 361 | "messages = [\n", 362 | " {\n", 363 | " \"role\": \"system\",\n", 364 | " \"content\": instructions\n", 365 | " }\n", 366 | "]" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "id": "4090707f", 372 | "metadata": {}, 373 | "source": [ 374 | "## Use agentic retrieval to fetch results\n", 375 | "\n", 376 | "This step runs the agentic retrieval pipeline to produce a grounded, citation-backed answer. Given the conversation history and retrieval parameters, your knowledge base:\n", 377 | "\n", 378 | "1. Analyzes the entire conversation to infer the user's information need.\n", 379 | "\n", 380 | "1. Decomposes the compound query into focused subqueries.\n", 381 | "\n", 382 | "1. Runs the subqueries concurrently against your knowledge source.\n", 383 | "\n", 384 | "1. Uses semantic ranker to rerank and filter the results.\n", 385 | "\n", 386 | "1. Synthesizes the top results into a natural-language answer." 
387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "id": "918ded26", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "from azure.search.documents.knowledgebases import KnowledgeBaseRetrievalClient\n", 397 | "from azure.search.documents.knowledgebases.models import KnowledgeBaseRetrievalRequest, KnowledgeBaseMessage, KnowledgeBaseMessageTextContent, SearchIndexKnowledgeSourceParams\n", 398 | "\n", 399 | "agent_client = KnowledgeBaseRetrievalClient(endpoint=search_endpoint, knowledge_base_name=knowledge_base_name, credential=credential)\n", 400 | "query_1 = \"\"\"\n", 401 | " Why do suburban belts display larger December brightening than urban cores even though absolute light levels are higher downtown?\n", 402 | " Why is the Phoenix nighttime street grid is so sharply visible from space, whereas large stretches of the interstate between midwestern cities remain comparatively dim?\n", 403 | " \"\"\"\n", 404 | "\n", 405 | "messages.append({\n", 406 | " \"role\": \"user\",\n", 407 | " \"content\": query_1\n", 408 | "})\n", 409 | "\n", 410 | "req = KnowledgeBaseRetrievalRequest(\n", 411 | " messages=[\n", 412 | " KnowledgeBaseMessage(\n", 413 | " role=m[\"role\"],\n", 414 | " content=[KnowledgeBaseMessageTextContent(text=m[\"content\"])]\n", 415 | " ) for m in messages if m[\"role\"] != \"system\"\n", 416 | " ],\n", 417 | " knowledge_source_params=[\n", 418 | " SearchIndexKnowledgeSourceParams(\n", 419 | " knowledge_source_name=knowledge_source_name,\n", 420 | " include_references=True,\n", 421 | " include_reference_source_data=True,\n", 422 | " always_query_source=True\n", 423 | " )\n", 424 | " ],\n", 425 | " include_activity=True,\n", 426 | " retrieval_reasoning_effort=KnowledgeRetrievalLowReasoningEffort\n", 427 | ")\n", 428 | "\n", 429 | "result = agent_client.retrieve(retrieval_request=req)\n", 430 | "print(f\"Retrieved content from '{knowledge_base_name}' successfully.\")" 431 | ] 432 | }, 433 | { 434 | 
"cell_type": "markdown", 435 | "id": "886fc687", 436 | "metadata": {}, 437 | "source": [ 438 | "### Review the retrieval response, activity, and results\n", 439 | "\n", 440 | "Because your knowledge base is configured for answer synthesis, the retrieval response contains the following values:\n", 441 | "\n", 442 | "+ `response_contents`: An LLM-generated answer to the query that cites the retrieved documents.\n", 443 | "\n", 444 | "+ `activity_contents`: Detailed planning and execution information, including subqueries, reranking decisions, and intermediate steps.\n", 445 | "\n", 446 | "+ `references_contents`: Source documents and chunks that contributed to the answer.\n", 447 | "\n", 448 | "**Tip:** Retrieval parameters, such as reranker thresholds and knowledge source parameters, influence how aggressively your agent reranks and which sources it queries. Inspect the activity and references to validate grounding and build traceable citations." 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "id": "d4d78fbe", 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "response_contents = []\n", 459 | "activity_contents = []\n", 460 | "references_contents = []" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "id": "7fccf4b0", 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "import json\n", 471 | "\n", 472 | "# Build simple string values for response_content, activity_content, and references_content\n", 473 | "\n", 474 | "# Responses -> Concatenate text/value fields from all response contents\n", 475 | "response_parts = []\n", 476 | "for resp in result.response:\n", 477 | " for content in resp.content:\n", 478 | " response_parts.append(content.text)\n", 479 | "response_content = \"\\n\\n\".join(response_parts) if response_parts else \"No response found on 'result'\"\n", 480 | "\n", 481 | "response_contents.append(response_content)\n", 482 | "\n", 483 | "# 
Print the three string values\n", 484 | "print(\"response_content:\\n\", response_content, \"\\n\")" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "id": "7355941b", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "messages.append({\n", 495 | " \"role\": \"assistant\",\n", 496 | " \"content\": response_content\n", 497 | "})" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "id": "4cef4fd3", 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "# Activity -> JSON string of activity as list of dicts\n", 508 | "if result.activity:\n", 509 | " activity_content = json.dumps([a.as_dict() for a in result.activity], indent=2)\n", 510 | "else:\n", 511 | " activity_content = \"No activity found on 'result'\"\n", 512 | " \n", 513 | "activity_contents.append(activity_content)\n", 514 | "print(\"activity_content:\\n\", activity_content, \"\\n\")" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "id": "172df234", 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "# References -> JSON string of references as list of dicts\n", 525 | "if result.references:\n", 526 | " references_content = json.dumps([r.as_dict() for r in result.references], indent=2)\n", 527 | "else:\n", 528 | " references_content = \"No references found on 'result'\"\n", 529 | " \n", 530 | "references_contents.append(references_content)\n", 531 | "print(\"references_content:\\n\", references_content)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "id": "75386ed1", 537 | "metadata": {}, 538 | "source": [ 539 | "## Continue the conversation\n", 540 | "\n", 541 | "This step continues the conversation with your knowledge base, building upon the previous messages and queries to retrieve relevant information from your knowledge source." 
542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "id": "da260539", 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "query_2 = \"How do I find lava at night?\"\n", 552 | "messages.append({\n", 553 | " \"role\": \"user\",\n", 554 | " \"content\": query_2\n", 555 | "})\n", 556 | "\n", 557 | "req = KnowledgeBaseRetrievalRequest(\n", 558 | " messages=[\n", 559 | " KnowledgeBaseMessage(\n", 560 | " role=m[\"role\"],\n", 561 | " content=[KnowledgeBaseMessageTextContent(text=m[\"content\"])]\n", 562 | " ) for m in messages if m[\"role\"] != \"system\"\n", 563 | " ],\n", 564 | " knowledge_source_params=[\n", 565 | " SearchIndexKnowledgeSourceParams(\n", 566 | " knowledge_source_name=knowledge_source_name,\n", 567 | " include_references=True,\n", 568 | " include_reference_source_data=True,\n", 569 | " always_query_source=True\n", 570 | " )\n", 571 | " ],\n", 572 | " include_activity=True,\n", 573 | " retrieval_reasoning_effort=KnowledgeRetrievalLowReasoningEffort\n", 574 | ")\n", 575 | "\n", 576 | "result = agent_client.retrieve(retrieval_request=req)\n", 577 | "print(f\"Retrieved content from '{knowledge_base_name}' successfully.\")" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "id": "fd1cba0c", 583 | "metadata": {}, 584 | "source": [ 585 | "### Review the new retrieval response, activity, and results" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "id": "35a1bfcb", 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "import json\n", 596 | "\n", 597 | "# Build simple string values for response_content, activity_content, and references_content\n", 598 | "\n", 599 | "# Responses -> Concatenate text/value fields from all response contents\n", 600 | "response_parts = []\n", 601 | "for resp in result.response:\n", 602 | " for content in resp.content:\n", 603 | " response_parts.append(content.text)\n", 604 | "response_content = 
\"\\n\\n\".join(response_parts) if response_parts else \"No response found on 'result'\"\n", 605 | "\n", 606 | "response_contents.append(response_content)\n", 607 | "\n", 608 | "# Print the three string values\n", 609 | "print(\"response_content:\\n\", response_content, \"\\n\")" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "id": "6f74c2c3", 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "# Activity -> JSON string of activity as list of dicts\n", 620 | "if result.activity:\n", 621 | " activity_content = json.dumps([a.as_dict() for a in result.activity], indent=2)\n", 622 | "else:\n", 623 | " activity_content = \"No activity found on 'result'\"\n", 624 | " \n", 625 | "activity_contents.append(activity_content)\n", 626 | "print(\"activity_content:\\n\", activity_content, \"\\n\")" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "id": "a6486c8a", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "# References -> JSON string of references as list of dicts\n", 637 | "if result.references:\n", 638 | " references_content = json.dumps([r.as_dict() for r in result.references], indent=2)\n", 639 | "else:\n", 640 | " references_content = \"No references found on 'result'\"\n", 641 | " \n", 642 | "references_contents.append(references_content)\n", 643 | "print(\"references_content:\\n\", references_content)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "id": "e98057c5", 649 | "metadata": {}, 650 | "source": [ 651 | "## Run an evaluation with Microsoft Foundry\n", 652 | "\n", 653 | "To evaluate the groundedness and relevance of the pipeline, run an evaluation with Foundry. For more detailed guidance, see [Evaluate your generative AI application locally with the Azure AI Evaluation SDK (preview)](https://learn.microsoft.com/azure/ai-foundry/how-to/develop/evaluate-sdk)." 
654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "id": "d88117b3", 659 | "metadata": {}, 660 | "source": [ 661 | "### Prerequisites\n", 662 | "\n", 663 | "+ The same [Foundry project](https://learn.microsoft.com/azure/ai-foundry/how-to/create-projects) you used for agentic retrieval. Set `FOUNDRY_ENDPOINT` to your project endpoint in the `.env` file. You can find this endpoint in the [Foundry portal](https://ai.azure.com/).\n", 664 | "\n", 665 | "+ The `azure-ai-evaluation` package, which is already installed as part of the `requirements.txt` file." 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "id": "80001db4", 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "# Load connections\n", 676 | "from dotenv import load_dotenv\n", 677 | "import os\n", 678 | "\n", 679 | "load_dotenv(override=True)\n", 680 | "\n", 681 | "foundry_endpoint = os.environ[\"FOUNDRY_ENDPOINT\"]\n", 682 | "aoai_api_version = os.environ[\"AOAI_API_VERSION\"]\n", 683 | "\n", 684 | "# Run the evaluation\n", 685 | "from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator, RelevanceEvaluator, evaluate\n", 686 | "import json\n", 687 | "\n", 688 | "evaluation_data = []\n", 689 | "print(\"Preparing evaluation data...\")\n", 690 | "for q, r, g in zip([query_1, query_2], references_contents, response_contents):\n", 691 | " evaluation_data.append({\n", 692 | " \"query\": q,\n", 693 | " \"response\": g,\n", 694 | " \"context\": r,\n", 695 | " })\n", 696 | "\n", 697 | "filename = \"evaluation_data.jsonl\"\n", 698 | "\n", 699 | "with open(filename, \"w\") as f:\n", 700 | " for item in evaluation_data:\n", 701 | " f.write(json.dumps(item) + \"\\n\")\n", 702 | "\n", 703 | "model_config = AzureOpenAIModelConfiguration(\n", 704 | " azure_endpoint=aoai_endpoint,\n", 705 | " api_version=aoai_api_version,\n", 706 | " azure_deployment=aoai_gpt_model\n", 707 | ")\n", 708 | "\n", 709 | "# RAG triad metrics\n", 710 | 
"groundedness = GroundednessEvaluator(model_config=model_config)\n", 711 | "relevance = RelevanceEvaluator(model_config=model_config)\n", 712 | "\n", 713 | "print(\"Starting evaluation...\")\n", 714 | "result = evaluate(\n", 715 | " data=filename,\n", 716 | " evaluators={\n", 717 | " \"groundedness\": groundedness,\n", 718 | " \"relevance\": relevance,\n", 719 | " },\n", 720 | " azure_ai_project=foundry_endpoint,\n", 721 | ")\n", 722 | "\n", 723 | "print(\"Evaluation complete.\")\n", 724 | "studio_url = result.get(\"studio_url\")\n", 725 | "print(\"For more information, go to the Foundry portal.\") if studio_url else None" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "id": "75777ed2", 731 | "metadata": {}, 732 | "source": [ 733 | "## Clean up objects and resources\n", 734 | "\n", 735 | "If you no longer need Azure AI Search or Microsoft Foundry, delete the resources from your Azure subscription. You can also start over by deleting individual objects." 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "id": "a14f6fe6", 741 | "metadata": {}, 742 | "source": [ 743 | "### Delete the knowledge base" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "id": "67b6a475", 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "from azure.search.documents.indexes import SearchIndexClient\n", 754 | "\n", 755 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 756 | "index_client.delete_knowledge_base(knowledge_base_name)\n", 757 | "print(f\"Knowledge base '{knowledge_base_name}' deleted successfully.\")" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "id": "ecdfb289", 763 | "metadata": {}, 764 | "source": [ 765 | "### Delete the knowledge source" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "id": "3b08f5e1", 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "from 
azure.search.documents.indexes import SearchIndexClient\n", 776 | "\n", 777 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 778 | "index_client.delete_knowledge_source(knowledge_source=knowledge_source_name)\n", 779 | "print(f\"Knowledge source '{knowledge_source_name}' deleted successfully.\")" 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "id": "a35bfbb1", 785 | "metadata": {}, 786 | "source": [ 787 | "### Delete the search index" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "id": "25f5e6a4", 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [ 797 | "from azure.search.documents.indexes import SearchIndexClient\n", 798 | "\n", 799 | "index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)\n", 800 | "index_client.delete_index(index_name)\n", 801 | "print(f\"Index '{index_name}' deleted successfully.\")" 802 | ] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": ".venv", 808 | "language": "python", 809 | "name": "python3" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 3 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython3", 821 | "version": "3.13.9" 822 | } 823 | }, 824 | "nbformat": 4, 825 | "nbformat_minor": 5 826 | } 827 | -------------------------------------------------------------------------------- /agentic-retrieval-pipeline-example/agent-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e3d4685", 6 | "metadata": {}, 7 | "source": [ 8 | "# Tutorial: Agentic retrieval using Azure AI Search and Foundry Agent Service\n", 9 | "\n", 10 | "Use this notebook to create an agentic retrieval pipeline built on Azure AI 
Search and Foundry Agent Service.\n", 11 | "\n", 12 | "In this notebook, you:\n", 13 | "\n", 14 | "1. Create and load an `earth-at-night` search index.\n", 15 | "\n", 16 | "1. Create an `earth-knowledge-source` that targets your index.\n", 17 | "\n", 18 | "1. Create an `earth-knowledge-base` that targets your knowledge source and an LLM for intelligent query planning.\n", 19 | "\n", 20 | "1. Use the knowledge base to fetch, rank, and synthesize relevant information from the index.\n", 21 | "\n", 22 | "1. Create an agent in Foundry Agent Service to determine when queries are needed.\n", 23 | "\n", 24 | "1. Create an MCP tool to orchestrate all requests.\n", 25 | "\n", 26 | "1. Start a chat with the agent.\n", 27 | "\n", 28 | "This notebook is referenced in [Tutorial: Build an end-to-end agentic retrieval solution using Azure AI Search](https://learn.microsoft.com/azure/search/search-agentic-retrieval-how-to-pipeline).\n", 29 | "\n", 30 | "Unlike [Quickstart: Use agentic retrieval in Azure AI Search](https://learn.microsoft.com/azure/search/search-get-started-agentic-retrieval), this quickstart uses Foundry Agent Service to determine whether to retrieve data from the knowledge source and uses an MCP tool for orchestration." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "ecd68a6e", 36 | "metadata": {}, 37 | "source": [ 38 | "## Prerequisites\n", 39 | "\n", 40 | "+ An Azure AI Search service in any [region that provides agentic retrieval](https://learn.microsoft.com/azure/search/search-region-support).\n", 41 | "\n", 42 | "+ A [Microsoft Foundry project](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/create-projects?view=foundry-classic&tabs=foundry) and resource. When you create a project, the resource is automatically created.\n", 43 | "\n", 44 | "+ A [supported LLM](https://learn.microsoft.com/azure/search/search-agentic-retrieval-how-to-create#supported-models) deployed to your project. This notebook uses `gpt-5-mini`. 
We recommend a minimum token capacity of 100,000. You can find the LLM's capacity and rate limit in the Foundry portal. If you want vectorization at query time, you should also deploy a text embedding model.\n", 45 | "\n", 46 | "+ [Visual Studio Code](https://code.visualstudio.com/download) with the [Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) and [Jupyter package](https://pypi.org/project/jupyter/)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "1a2379a9", 52 | "metadata": {}, 53 | "source": [ 54 | "## Configure access\n", 55 | "\n", 56 | "This notebook assumes that you're using Microsoft Entra ID for authentication and role assignments for authorization.\n", 57 | "\n", 58 | "To configure role-based access:\n", 59 | "\n", 60 | "1. Sign in to the [Azure portal](https://portal.azure.com).\n", 61 | "\n", 62 | "1. On your Azure AI Search service:\n", 63 | "\n", 64 | " 1. [Enable role-based access](https://learn.microsoft.com/azure/search/search-security-enable-roles).\n", 65 | " \n", 66 | " 1. [Create a system-assigned managed identity](https://learn.microsoft.com/azure/search/search-howto-managed-identities-data-sources#create-a-system-managed-identity).\n", 67 | " \n", 68 | " 1. [Assign the following roles](https://learn.microsoft.com/azure/search/search-security-rbac#how-to-assign-roles-in-the-azure-portal) to yourself.\n", 69 | " \n", 70 | " + **Search Service Contributor**\n", 71 | " \n", 72 | " + **Search Index Data Contributor**\n", 73 | " \n", 74 | " + **Search Index Data Reader**\n", 75 | "\n", 76 | " 1. Assign **Search Index Data Reader** to your Microsoft Foundry project.\n", 77 | "\n", 78 | "1. On your Microsoft Foundry resource:\n", 79 | "\n", 80 | " 1. Assign the following roles to yourself.\n", 81 | "\n", 82 | " + **Azure AI User**\n", 83 | "\n", 84 | " + **Azure AI Project Manager**\n", 85 | "\n", 86 | " 1. Assign **Cognitive Services User** to the managed identity of your search service." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "1f40a871", 92 | "metadata": {}, 93 | "source": [ 94 | "## Set up connections\n", 95 | "\n", 96 | "Save the `sample.env` file as `.env` and then modify the environment variables to use your Azure endpoints. You need endpoints for:\n", 97 | "\n", 98 | "+ Azure AI Search\n", 99 | "+ Azure OpenAI (for the models deployed to your project)\n", 100 | "+ Microsoft Foundry project\n", 101 | "\n", 102 | "You also need the resource ID of your project. You can find all of these values in the [Azure portal](https://portal.azure.com/)." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "679bc80a", 108 | "metadata": {}, 109 | "source": [ 110 | "## Load connections\n", 111 | "\n", 112 | "We recommend creating a virtual environment to run this sample code. In Visual Studio Code, open the Command Palette (Ctrl+Shift+P) to create an environment. This notebook was tested on Python 3.13.7.\n", 113 | "\n", 114 | "After your environment is created, load the environment variables to set up connections and object names."
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 1, 120 | "id": "e42b4a10", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "import os\n", 125 | "\n", 126 | "from azure.identity import DefaultAzureCredential\n", 127 | "from azure.mgmt.core.tools import parse_resource_id\n", 128 | "from dotenv import load_dotenv\n", 129 | "\n", 130 | "load_dotenv(override=True) # take environment variables from .env.\n", 131 | "\n", 132 | "project_endpoint = os.environ[\"PROJECT_ENDPOINT\"]\n", 133 | "project_resource_id = os.environ[\"PROJECT_RESOURCE_ID\"]\n", 134 | "project_connection_name = os.getenv(\"PROJECT_CONNECTION_NAME\", \"earthknowledgeconnection\")\n", 135 | "agent_model = os.getenv(\"AGENT_MODEL\", \"gpt-4.1-mini\")\n", 136 | "agent_name = os.getenv(\"AGENT_NAME\", \"earth-knowledge-agent\")\n", 137 | "endpoint = os.environ[\"AZURE_SEARCH_ENDPOINT\"]\n", 138 | "credential = DefaultAzureCredential()\n", 139 | "knowledge_source_name = os.getenv(\"AZURE_SEARCH_KNOWLEDGE_SOURCE_NAME\", \"earth-knowledge-source\")\n", 140 | "index_name = os.getenv(\"AZURE_SEARCH_INDEX\", \"earth-at-night\")\n", 141 | "azure_openai_endpoint = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", 142 | "azure_openai_gpt_deployment = os.getenv(\"AZURE_OPENAI_GPT_DEPLOYMENT\", \"gpt-4.1-mini\")\n", 143 | "azure_openai_gpt_model = os.getenv(\"AZURE_OPENAI_GPT_MODEL\", \"gpt-4.1-mini\")\n", 144 | "azure_openai_embedding_deployment = os.getenv(\"AZURE_OPENAI_EMBEDDING_DEPLOYMENT\", \"text-embedding-3-large\")\n", 145 | "azure_openai_embedding_model = os.getenv(\"AZURE_OPENAI_EMBEDDING_MODEL\", \"text-embedding-3-large\")\n", 146 | "base_name = os.getenv(\"AZURE_SEARCH_AGENT_NAME\", \"earth-knowledge-base\")\n", 147 | "\n", 148 | "# Parse the resource ID to extract subscription and other components\n", 149 | "parsed_resource_id = parse_resource_id(project_resource_id)\n", 150 | "subscription_id = parsed_resource_id['subscription']\n", 151 | "resource_group = 
parsed_resource_id['resource_group']\n", 152 | "account_name = parsed_resource_id['name']\n", 153 | "project_name = parsed_resource_id['child_name_1']" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "ea2ecdce", 159 | "metadata": {}, 160 | "source": [ 161 | "## Create a search index\n", 162 | "\n", 163 | "This step creates a search index that contains plain text and vector content. You can use an existing index, but it must meet the [criteria for agentic retrieval workloads](https://learn.microsoft.com/azure/search/search-agentic-retrieval-how-to-index). The primary schema requirement is a semantic configuration with a `default_configuration_name`." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 2, 169 | "id": "91fd6810", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Index 'earth-at-night' created or updated successfully\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "from azure.search.documents.indexes import SearchIndexClient\n", 182 | "from azure.search.documents.indexes.models import (\n", 183 | "    AzureOpenAIVectorizer, AzureOpenAIVectorizerParameters,\n", 184 | "    HnswAlgorithmConfiguration, SearchField, SearchIndex,\n", 185 | "    SemanticConfiguration, SemanticField, SemanticPrioritizedFields,\n", 186 | "    SemanticSearch, VectorSearch, VectorSearchProfile\n", 187 | ")\n", 188 | "\n", 189 | "index = SearchIndex(\n", 190 | "    name=index_name,\n", 191 | "    fields=[\n", 192 | "        SearchField(name=\"id\", type=\"Edm.String\", key=True, filterable=True, sortable=True, facetable=True),\n", 193 | "        SearchField(name=\"page_chunk\", type=\"Edm.String\", filterable=False, sortable=False, facetable=False),\n", 194 | "        SearchField(name=\"page_embedding_text_3_large\", type=\"Collection(Edm.Single)\", stored=False, vector_search_dimensions=3072, vector_search_profile_name=\"hnsw_text_3_large\"),\n", 195 | "        SearchField(name=\"page_number\",
type=\"Edm.Int32\", filterable=True, sortable=True, facetable=True)\n", 196 | " ],\n", 197 | " vector_search=VectorSearch(\n", 198 | " profiles=[VectorSearchProfile(name=\"hnsw_text_3_large\", algorithm_configuration_name=\"alg\", vectorizer_name=\"azure_openai_text_3_large\")],\n", 199 | " algorithms=[HnswAlgorithmConfiguration(name=\"alg\")],\n", 200 | " vectorizers=[\n", 201 | " AzureOpenAIVectorizer(\n", 202 | " vectorizer_name=\"azure_openai_text_3_large\",\n", 203 | " parameters=AzureOpenAIVectorizerParameters(\n", 204 | " resource_url=azure_openai_endpoint,\n", 205 | " deployment_name=azure_openai_embedding_deployment,\n", 206 | " model_name=azure_openai_embedding_model\n", 207 | " )\n", 208 | " )\n", 209 | " ]\n", 210 | " ),\n", 211 | " semantic_search=SemanticSearch(\n", 212 | " default_configuration_name=\"semantic_config\",\n", 213 | " configurations=[\n", 214 | " SemanticConfiguration(\n", 215 | " name=\"semantic_config\",\n", 216 | " prioritized_fields=SemanticPrioritizedFields(\n", 217 | " content_fields=[\n", 218 | " SemanticField(field_name=\"page_chunk\")\n", 219 | " ]\n", 220 | " )\n", 221 | " )\n", 222 | " ]\n", 223 | " )\n", 224 | ")\n", 225 | "\n", 226 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 227 | "index_client.create_or_update_index(index)\n", 228 | "print(f\"Index '{index_name}' created or updated successfully\")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "376b9785", 234 | "metadata": {}, 235 | "source": [ 236 | "## Upload sample documents\n", 237 | "\n", 238 | "This notebook uses data from NASA's Earth at Night e-book. The data is retrieved from the [azure-search-sample-data](https://github.com/Azure-Samples/azure-search-sample-data) repository on GitHub and passed to the search client for indexing." 
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 3, 244 | "id": "f98f31e7", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "Documents uploaded to index 'earth-at-night'\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "import requests\n", 257 | "from azure.search.documents import SearchIndexingBufferedSender\n", 258 | "\n", 259 | "url = \"https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/nasa-e-book/earth-at-night-json/documents.json\"\n", 260 | "documents = requests.get(url).json()\n", 261 | "\n", 262 | "with SearchIndexingBufferedSender(endpoint=endpoint, index_name=index_name, credential=credential) as client:\n", 263 | " client.upload_documents(documents=documents)\n", 264 | "\n", 265 | "print(f\"Documents uploaded to index '{index_name}'\")" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "38c2d9d5", 271 | "metadata": {}, 272 | "source": [ 273 | "## Create a knowledge source\n", 274 | "\n", 275 | "This step creates a knowledge source that targets the index you previously created. 
In the next step, you create a knowledge base that uses the knowledge source to orchestrate agentic retrieval.\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 4, 281 | "id": "0cf01881", 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Knowledge source 'earth-knowledge-source' created or updated successfully.\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "from azure.search.documents.indexes import SearchIndexClient\n", 294 | "from azure.search.documents.indexes.models import (\n", 295 | " SearchIndexFieldReference, SearchIndexKnowledgeSource,\n", 296 | " SearchIndexKnowledgeSourceParameters\n", 297 | ")\n", 298 | "\n", 299 | "ks = SearchIndexKnowledgeSource(\n", 300 | " name=knowledge_source_name,\n", 301 | " description=\"Knowledge source for Earth at night data\",\n", 302 | " search_index_parameters=SearchIndexKnowledgeSourceParameters(\n", 303 | " search_index_name=index_name,\n", 304 | " source_data_fields=[SearchIndexFieldReference(name=\"id\"), SearchIndexFieldReference(name=\"page_number\")]\n", 305 | " ),\n", 306 | ")\n", 307 | "\n", 308 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 309 | "index_client.create_or_update_knowledge_source(knowledge_source=ks)\n", 310 | "print(f\"Knowledge source '{knowledge_source_name}' created or updated successfully.\")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "id": "e3d0081e", 316 | "metadata": {}, 317 | "source": [ 318 | "## Create a knowledge base\n", 319 | "\n", 320 | "This step creates a knowledge base, which acts as a wrapper for your knowledge source and LLM deployment.\n", 321 | "\n", 322 | "`EXTRACTIVE_DATA` is the default modality and returns content from your knowledge sources without answer generation. This is recommended for interaction with Foundry Agent Service." 
323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 5, 328 | "id": "fbe31e32", 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "Knowledge base 'earth-knowledge-base' created or updated successfully\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "from azure.search.documents.indexes import SearchIndexClient\n", 341 | "from azure.search.documents.indexes.models import (\n", 342 | "    KnowledgeBase, KnowledgeRetrievalMinimalReasoningEffort,\n", 343 | "    KnowledgeRetrievalOutputMode, KnowledgeSourceReference\n", 344 | ")\n", 345 | "\n", 346 | "knowledge_base = KnowledgeBase(\n", 347 | "    name=base_name,\n", 348 | "    knowledge_sources=[\n", 349 | "        KnowledgeSourceReference(\n", 350 | "            name=knowledge_source_name\n", 351 | "        )\n", 352 | "    ],\n", 353 | "    output_mode=KnowledgeRetrievalOutputMode.EXTRACTIVE_DATA,\n", 354 | "    retrieval_reasoning_effort=KnowledgeRetrievalMinimalReasoningEffort()\n", 355 | ")\n", 356 | "\n", 357 | "\n", 358 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 359 | "index_client.create_or_update_knowledge_base(knowledge_base=knowledge_base)\n", 360 | "print(f\"Knowledge base '{base_name}' created or updated successfully\")\n", 361 | "\n", 362 | "mcp_endpoint = f\"{endpoint}/knowledgebases/{base_name}/mcp?api-version=2025-11-01-Preview\"" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "id": "ff845de0", 368 | "metadata": {}, 369 | "source": [ 370 | "## Create an agent\n", 371 | "\n", 372 | "In Foundry Agent Service, an agent is a smart micro-service that can use an LLM with tools. The purpose of this agent is to use retrieval tools from the knowledge base to do RAG.\n", 373 | "\n", 374 | "Your Foundry project may have no agents at this stage, but if you've already run this notebook, you will see the agent listed here."
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "6eb0ebd3", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "from azure.ai.projects import AIProjectClient\n", 385 | "\n", 386 | "project_client = AIProjectClient(endpoint=project_endpoint, credential=credential)\n", 387 | "\n", 388 | "list(project_client.agents.list())" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "id": "61de7601", 394 | "metadata": {}, 395 | "source": [ 396 | "## Create an MCP tool connection\n", 397 | "\n", 398 | "In Microsoft Foundry, you must create a connection to authenticate to your MCP tool." 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "80c209b9", 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "Connection 'earthknowledgeconnection' created or updated successfully.\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "import requests\n", 417 | "from azure.identity import get_bearer_token_provider\n", 418 | "\n", 419 | "# Requires the Foundry project to have Search Index Data Reader role on the search service\n", 420 | "bearer_token_provider = get_bearer_token_provider(credential, \"https://management.azure.com/.default\")\n", 421 | "headers = {\n", 422 | " \"Authorization\": f\"Bearer {bearer_token_provider()}\",\n", 423 | "}\n", 424 | "response = requests.put(\n", 425 | " f\"https://management.azure.com{project_resource_id}/connections/{project_connection_name}?api-version=2025-10-01-preview\",\n", 426 | " headers=headers,\n", 427 | " json={\n", 428 | " \"name\": project_connection_name,\n", 429 | " \"type\": \"Microsoft.MachineLearningServices/workspaces/connections\",\n", 430 | " \"properties\": {\n", 431 | " \"authType\": \"ProjectManagedIdentity\",\n", 432 | " \"category\": \"RemoteTool\",\n", 433 | " \"target\": mcp_endpoint,\n", 434 | " \"isSharedToAll\": True,\n", 435 | " 
\"audience\": \"https://search.azure.com/\",\n", 436 | " \"metadata\": { \"ApiType\": \"Azure\" }\n", 437 | " }\n", 438 | " }\n", 439 | ")\n", 440 | "response.raise_for_status()\n", 441 | "print(f\"Connection '{project_connection_name}' created or updated successfully.\")" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "id": "6631324e", 447 | "metadata": {}, 448 | "source": [ 449 | "## Optimize agent instructions for knowledge retrieval\n", 450 | "\n", 451 | "To maximize the accuracy of knowledge base invocations and ensure proper citation formatting, use optimized agent instructions. Based on our experiments, we recommend the following instruction template as a starting point:\n", 452 | "\n", 453 | "```\n", 454 | "You are a helpful assistant that must use the knowledge base to answer all the questions from user. You must never answer from your own knowledge under any circumstances.\n", 455 | "Every answer must always provide annotations for using the MCP knowledge base tool and render them as: `【message_idx:search_idx†source_name】`\n", 456 | "If you cannot find the answer in the provided knowledge base you must respond with \"I don't know\".\n", 457 | "```\n", 458 | "\n", 459 | "The specified citation format ensures the agent includes provenance information in responses, making it clear which knowledge sources were used." 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "id": "aa363122", 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | "AI agent 'earth-knowledge-agent' created or updated successfully\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "from azure.ai.projects.models import PromptAgentDefinition, MCPTool\n", 478 | "\n", 479 | "instructions = \"\"\"\n", 480 | "You are a helpful assistant that must use the knowledge base to answer all the questions from user. 
You must never answer from your own knowledge under any circumstances.\n", 481 | "Every answer must always provide annotations for using the MCP knowledge base tool and render them as: `【message_idx:search_idx†source_name】`\n", 482 | "If you cannot find the answer in the provided knowledge base you must respond with \"I don't know\".\n", 483 | "\"\"\"\n", 484 | "mcp_kb_tool = MCPTool(\n", 485 | " server_label=\"knowledge-base\",\n", 486 | " server_url=mcp_endpoint,\n", 487 | " require_approval=\"never\",\n", 488 | " allowed_tools=[\"knowledge_base_retrieve\"],\n", 489 | " project_connection_id=project_connection_name\n", 490 | ")\n", 491 | "agent = project_client.agents.create_version(\n", 492 | " agent_name=agent_name,\n", 493 | " definition=PromptAgentDefinition(\n", 494 | " model=agent_model,\n", 495 | " instructions=instructions,\n", 496 | " tools=[mcp_kb_tool]\n", 497 | " )\n", 498 | ")\n", 499 | "\n", 500 | "\n", 501 | "print(f\"AI agent '{agent_name}' created or updated successfully\")" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "id": "16a2d5ed", 507 | "metadata": {}, 508 | "source": [ 509 | "## Start a chat with the agent\n", 510 | "\n", 511 | "Set the `tool_choice` parameter to \"required\" to ensure the knowledge base tool is consistently used" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 9, 517 | "id": "e9492c4a", 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Response: Here are evidence-based explanations to your questions:\n", 525 | "\n", 526 | "---\n", 527 | "\n", 528 | "**1. 
Why do suburban belts display larger December brightening than urban cores, even though absolute light levels are higher downtown?**\n", 529 | "\n", 530 | "- Suburban belts show a *larger percentage increase* in night brightness during December compared to urban cores, largely because suburban residential areas feature more single-family homes and larger yards, which are typically decorated with holiday lights. These areas start from a lower baseline (less bright overall at night compared to dense urban centers), so the relative change (brightening) is much more noticeable.\n", 531 | "\n", 532 | "- In contrast, the downtown core is already very bright at night due to dense commercial lighting and streetlights. While it also sees a December increase (often 20–30% brighter), the *absolute* change is less striking because it begins at a much higher base of illumination.\n", 533 | "\n", 534 | "- This pattern is observed across U.S. cities, with the phenomenon driven by widespread cultural practices and the suburban landscape’s suitability for holiday lighting displays. The effect is visible in satellite data and was quantified at 20–50% brighter in December, especially in suburbs and city outskirts.\n", 535 | "\n", 536 | "---\n", 537 | "\n", 538 | "**2. Why is the Phoenix nighttime street grid so sharply visible from space, whereas large stretches of the interstate between midwestern cities remain comparatively dim?**\n", 539 | "\n", 540 | "- Phoenix’s sharply visible nighttime street grid from space is a result of its urban layout: the city (like many western U.S. cities) was developed using a regular grid system, with extensive and uniform street lighting and strong urban sprawl. 
The grid pattern, and the dense network of intersecting surface streets, is brightly illuminated, particularly at intersections, commercial areas, and major thoroughfares.\n", 541 | "\n", 542 | "- The interstate highways between midwestern cities, though significant in length and crucial to national infrastructure, traverse sparsely populated rural areas. These stretches typically have very little artificial lighting (due to low traffic volumes at night and cost considerations), making them much less visible in nighttime satellite imagery. Only nodes (cities and towns) along the route show as bright \"pearls\" in the darkness, while the \"strings\" (highways) connecting them remain faint or invisible.\n", 543 | "\n", 544 | "- In summary:\n", 545 | " - Urban areas like Phoenix stand out with strong, connected patterns of light due to dense development and extensive lighting.\n", 546 | " - Rural interstates are sparsely lit, and only their endpoints—cities and large towns—generate notable light visible from space.\n", 547 | "\n", 548 | "---\n", 549 | "\n", 550 | "**References**:\n", 551 | "- [Holiday Lights increase most dramatically in suburbs, not downtowns: earth_at_night_508_page_176_verbalized, page 160](4:5)\n", 552 | "- [Lighting paths and urban grids are visible from space, while rural highways remain dim: earth_at_night_508_page_124_verbalized, page 108](4:3)\n", 553 | "- [Phoenix’s grid and surrounding urban structure: earth_at_night_508_page_104_verbalized, page 88](4:1)\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "# Get the OpenAI client for responses and conversations\n", 559 | "openai_client = project_client.get_openai_client()\n", 560 | "\n", 561 | "conversation = openai_client.conversations.create()\n", 562 | "\n", 563 | "# Send initial request that will trigger the MCP tool\n", 564 | "response = openai_client.responses.create(\n", 565 | " conversation=conversation.id,\n", 566 | " tool_choice=\"required\",\n", 567 | " input=\"\"\"\n", 568 | " 
Why do suburban belts display larger December brightening than urban cores even though absolute light levels are higher downtown?\n", 569 | " Why is the Phoenix nighttime street grid so sharply visible from space, whereas large stretches of the interstate between midwestern cities remain comparatively dim?\n", 570 | " \"\"\",\n", 571 | " extra_body={\"agent\": {\"name\": agent.name, \"type\": \"agent_reference\"}},\n", 572 | ")\n", 573 | "\n", 574 | "print(f\"Response: {response.output_text}\")\n" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "id": "39c02785", 580 | "metadata": {}, 581 | "source": [ 582 | "## Inspect the response\n", 583 | "\n", 584 | "The underlying response from the agent contains metadata about what queries the agent sent to the knowledge base and what citations were found" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "id": "b845d2ae", 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "response.to_dict()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "id": "3b328340", 600 | "metadata": {}, 601 | "source": [ 602 | "## (Optional) Add remote SharePoint as a knowledge source\n", 603 | "\n", 604 | "Adding a remote SharePoint knowledge source requires an additional `x-ms-query-source-authorization` header in your MCP connection."
605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 11, 610 | "id": "3711252f", 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "name": "stdout", 615 | "output_type": "stream", 616 | "text": [ 617 | "Knowledge source 'remote-sharepoint' created or updated successfully.\n", 618 | "Knowledge base 'earth-knowledge-base' updated with new knowledge source successfully\n", 619 | "AI agent 'earth-knowledge-agent' created or updated successfully\n" 620 | ] 621 | } 622 | ], 623 | "source": [ 624 | "from azure.search.documents.indexes.models import RemoteSharePointKnowledgeSource, KnowledgeSourceReference\n", 625 | "from azure.search.documents.indexes import SearchIndexClient\n", 626 | "from azure.identity import get_bearer_token_provider\n", 627 | "\n", 628 | "remote_sp_ks = RemoteSharePointKnowledgeSource(\n", 629 | " name=\"remote-sharepoint\",\n", 630 | " description=\"SharePoint knowledge source\"\n", 631 | ")\n", 632 | "\n", 633 | "index_client = SearchIndexClient(endpoint=endpoint, credential=credential)\n", 634 | "index_client.create_or_update_knowledge_source(knowledge_source=remote_sp_ks)\n", 635 | "print(f\"Knowledge source '{remote_sp_ks.name}' created or updated successfully.\")\n", 636 | "\n", 637 | "knowledge_base.knowledge_sources = [\n", 638 | " KnowledgeSourceReference(name=remote_sp_ks.name), KnowledgeSourceReference(name=knowledge_source_name)\n", 639 | "]\n", 640 | "index_client.create_or_update_knowledge_base(knowledge_base=knowledge_base)\n", 641 | "print(f\"Knowledge base '{base_name}' updated with new knowledge source successfully\")\n", 642 | "\n", 643 | "mcp_kb_tool = MCPTool(\n", 644 | " server_label=\"knowledge-base\",\n", 645 | " server_url=mcp_endpoint,\n", 646 | " require_approval=\"never\",\n", 647 | " allowed_tools=[\"knowledge_base_retrieve\"],\n", 648 | " project_connection_id=project_connection_name,\n", 649 | " headers={\n", 650 | " \"x-ms-query-source-authorization\": get_bearer_token_provider(credential, 
\"https://search.azure.com/.default\")()\n", 651 | " }\n", 652 | ")\n", 653 | "agent = project_client.agents.create_version(\n", 654 | " agent_name=agent_name,\n", 655 | " definition=PromptAgentDefinition(\n", 656 | " model=agent_model,\n", 657 | " instructions=instructions,\n", 658 | " tools=[mcp_kb_tool]\n", 659 | " )\n", 660 | ")\n", 661 | "\n", 662 | "\n", 663 | "print(f\"AI agent '{agent_name}' created or updated successfully\")" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "id": "0926264d", 669 | "metadata": {}, 670 | "source": [ 671 | "## Clean up objects and resources\n", 672 | "\n", 673 | "If you no longer need Azure AI Search or Microsoft Foundry, delete the resources from your Azure subscription. You can also start over by deleting individual objects." 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "id": "4395247f", 679 | "metadata": {}, 680 | "source": [ 681 | "### Delete the agent" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 12, 687 | "id": "409befbb", 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "AI agent 'earth-knowledge-agent' version '7' deleted successfully\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "project_client.agents.delete_version(agent.name, agent.version)\n", 700 | "print(f\"AI agent '{agent.name}' version '{agent.version}' deleted successfully\")" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "id": "b7e67115", 706 | "metadata": {}, 707 | "source": [ 708 | "### Delete the knowledge base" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 13, 714 | "id": "d67f8609", 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "name": "stdout", 719 | "output_type": "stream", 720 | "text": [ 721 | "Knowledge base 'earth-knowledge-base' deleted successfully\n" 722 | ] 723 | } 724 | ], 725 | "source": [ 726 | 
"index_client.delete_knowledge_base(base_name)\n", 727 | "print(f\"Knowledge base '{base_name}' deleted successfully\")" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "id": "ff523474", 733 | "metadata": {}, 734 | "source": [ 735 | "### Delete the knowledge source" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 14, 741 | "id": "e35a6eb0", 742 | "metadata": {}, 743 | "outputs": [ 744 | { 745 | "name": "stdout", 746 | "output_type": "stream", 747 | "text": [ 748 | "Knowledge source 'earth-knowledge-source' deleted successfully.\n" 749 | ] 750 | } 751 | ], 752 | "source": [ 753 | "index_client.delete_knowledge_source(knowledge_source=knowledge_source_name) # This is new feature in 2025-08-01-Preview api version\n", 754 | "print(f\"Knowledge source '{knowledge_source_name}' deleted successfully.\")\n" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "id": "882ea545", 760 | "metadata": {}, 761 | "source": [ 762 | "### Delete the search index" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 15, 768 | "id": "d9895f27", 769 | "metadata": {}, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "Index 'earth-at-night' deleted successfully\n" 776 | ] 777 | } 778 | ], 779 | "source": [ 780 | "index_client.delete_index(index)\n", 781 | "print(f\"Index '{index_name}' deleted successfully\")" 782 | ] 783 | } 784 | ], 785 | "metadata": { 786 | "kernelspec": { 787 | "display_name": ".venv", 788 | "language": "python", 789 | "name": "python3" 790 | }, 791 | "language_info": { 792 | "codemirror_mode": { 793 | "name": "ipython", 794 | "version": 3 795 | }, 796 | "file_extension": ".py", 797 | "mimetype": "text/x-python", 798 | "name": "python", 799 | "nbconvert_exporter": "python", 800 | "pygments_lexer": "ipython3", 801 | "version": "3.12.10" 802 | } 803 | }, 804 | "nbformat": 4, 805 | "nbformat_minor": 5 806 | } 807 | 
--------------------------------------------------------------------------------