├── img └── custom-skill-archi.png ├── openai-custom-skill ├── .vscode │ └── extensions.json ├── requirements.txt ├── host.json ├── summarize │ ├── function.json │ └── __init__.py ├── .gitignore ├── getting_started_with_azure_functions.md └── custom_skillset_setup.md ├── LICENSE ├── cognitive_search_skillset ├── Cognitive Search-OpenAI Integration.postman_environment.json ├── cognitive_search_setup.md └── Cognitive Search-OpenAI Integration.postman_collection.json └── README.md /img/custom-skill-archi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Anaig/OpenAI-and-Cognitive-Search/HEAD/img/custom-skill-archi.png -------------------------------------------------------------------------------- /openai-custom-skill/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions" 4 | ] 5 | } -------------------------------------------------------------------------------- /openai-custom-skill/requirements.txt: -------------------------------------------------------------------------------- 1 | # Do not include azure-functions-worker as it may conflict with the Azure Functions platform 2 | 3 | azure-functions 4 | openai -------------------------------------------------------------------------------- /openai-custom-skill/host.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0", 3 | "logging": { 4 | "applicationInsights": { 5 | "samplingSettings": { 6 | "isEnabled": true, 7 | "excludedTypes": "Request" 8 | } 9 | } 10 | }, 11 | "extensionBundle": { 12 | "id": "Microsoft.Azure.Functions.ExtensionBundle", 13 | "version": "[3.*, 4.0.0)" 14 | } 15 | } -------------------------------------------------------------------------------- /openai-custom-skill/summarize/function.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "scriptFile": "__init__.py", 3 | "bindings": [ 4 | { 5 | "authLevel": "function", 6 | "type": "httpTrigger", 7 | "direction": "in", 8 | "name": "req", 9 | "methods": [ 10 | "get", 11 | "post" 12 | ] 13 | }, 14 | { 15 | "type": "http", 16 | "direction": "out", 17 | "name": "$return" 18 | } 19 | ] 20 | } -------------------------------------------------------------------------------- /openai-custom-skill/.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | obj 3 | csx 4 | .vs 5 | edge 6 | Publish 7 | 8 | *.user 9 | *.suo 10 | *.cscfg 11 | *.Cache 12 | project.lock.json 13 | 14 | /packages 15 | /TestResults 16 | 17 | /tools/NuGet.exe 18 | /App_Data 19 | /secrets 20 | /data 21 | .secrets 22 | appsettings.json 23 | local.settings.json 24 | 25 | node_modules 26 | dist 27 | 28 | # Local python packages 29 | .python_packages/ 30 | 31 | # Python Environments 32 | .env 33 | .venv 34 | env/ 35 | venv/ 36 | ENV/ 37 | env.bak/ 38 | venv.bak/ 39 | 40 | # Byte-compiled / optimized / DLL files 41 | __pycache__/ 42 | *.py[cod] 43 | *$py.class 44 | 45 | # Azurite artifacts 46 | __blobstorage__ 47 | __queuestorage__ 48 | __azurite_db*__.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anaig 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright 
notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cognitive_search_skillset/Cognitive Search-OpenAI Integration.postman_environment.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "14ebb3f7-9ada-4110-8f6c-b021c65823fd", 3 | "name": "OpenAI and Cognitive Search", 4 | "values": [ 5 | { 6 | "key": "search_service", 7 | "value": "[YOUR_SEARCH_SERVICE_NAME]", 8 | "type": "default", 9 | "enabled": true 10 | }, 11 | { 12 | "key": "index_name", 13 | "value": "[YOUR_SEARCH_INDEX_NAME_TO_BE_DEPLOYED]", 14 | "type": "default", 15 | "enabled": true 16 | }, 17 | { 18 | "key": "env_search_api_key", 19 | "value": "[YOUR_COGNITIVE_SEARCH_KEY]", 20 | "type": "secret", 21 | "enabled": true 22 | }, 23 | { 24 | "key": "env_storage_connection_string", 25 | "value": "[YOUR_DATA_STORAGE_CONNECTION_STRING]", 26 | "type": "default", 27 | "enabled": true 28 | }, 29 | { 30 | "key": "env_storage_container", 31 | "value": "[YOUR_STORAGE_CONTAINER_NAME]", 32 | "type": "default", 33 | "enabled": true 34 | }, 35 | { 36 | "key": "cog_services_key", 37 | "value": "[YOUR_COGNITIVE_SERVICE_KEY]", 38 | "type": "secret", 39 | "enabled": true 40 | }, 41 | { 42 | "key": "api_version", 43 | "value": "2020-06-30", 44 | "type": "default", 45 | "enabled": true 46 | }, 47 | { 48 | "key": "function_key", 49 | "value": 
"""Azure Function implementing an Azure Cognitive Search custom skill.

The function receives a batch of records from Cognitive Search, sends the
``text`` input of each record to Azure OpenAI with a summarization prompt and
returns one result per record in the custom Web API skill response format.

NOTE: this module uses the legacy (pre-1.0) interface of the ``openai``
package (``openai.Completion`` and the module-level ``openai.api_*`` settings).
"""
import os
import json
import logging
import openai
import azure.functions as func

# Your Azure OpenAI endpoint
OPENAI_ENDPOINT = "https://YOUR_OPEN_AI_SERVICE_NAME.openai.azure.com/"
# Prompt to be sent to Azure OpenAI
OPENAI_PROMPT = "Summarize the following text in 3 sentences:"


def main(req: func.HttpRequest) -> func.HttpResponse:
    """Get text from Cognitive Search and return OpenAI completions.

    The request body must be a JSON object with a ``values`` list; each entry
    carries a ``recordId`` and a ``data.text`` input. Every record of the batch
    is processed (not only the first one) and answered with the same
    ``recordId``.

    Returns:
        A JSON ``HttpResponse`` of the form ``{"values": [...]}``. A malformed
        payload yields a 400 response; a failure on a single record is reported
        in that record's ``errors`` list so the other records still succeed.
    """

    logging.info('Python HTTP trigger function processed a request.')

    # Extract the records from the request payload. ValueError covers both
    # invalid UTF-8 and invalid JSON; KeyError/TypeError cover a wrong shape.
    try:
        request = json.loads(req.get_body().decode('utf-8'))
        records = request['values']
    except (ValueError, KeyError, TypeError):
        return _json_response(
            {"error": "Request body must be a JSON object with a 'values' array."},
            status_code=400)

    if not isinstance(records, list):
        return _json_response(
            {"error": "'values' must be an array of records."},
            status_code=400)

    response_body = {"values": [_process_record(record) for record in records]}
    return _json_response(response_body)


def _json_response(payload, status_code=200):
    """Serialize ``payload`` to JSON and wrap it in an HTTP response."""
    return func.HttpResponse(
        json.dumps(payload),
        status_code=status_code,
        mimetype='application/json')


def _process_record(record):
    """Build the custom-skill result entry for a single input record."""
    record_id = record.get('recordId') if isinstance(record, dict) else None
    result = {
        "recordId": record_id,
        "data": {},
        "errors": None,
        "warnings": None
    }

    try:
        text = record['data']['text']
    except (KeyError, TypeError):
        result["errors"] = [{"message": "Record is missing the 'data.text' input."}]
        return result

    try:
        summary = get_summary(text)
    except Exception as exc:  # report per record instead of failing the whole batch
        logging.exception('Azure OpenAI call failed for record %s', record_id)
        # Only the exception type is returned to the caller; details stay in the logs.
        result["errors"] = [
            {"message": f"Azure OpenAI request failed: {type(exc).__name__}"}
        ]
        return result

    result["data"] = {"summary": summary}
    if summary is None:
        result["warnings"] = [{
            "message": "No summary produced: the input text was empty or "
                       "the completion did not finish normally."
        }]
    return result


# Get an OpenAI summary
def get_summary(text):
    """Send the summarization prompt followed by ``text`` to Azure OpenAI.

    Args:
        text: The document text to summarize.

    Returns:
        The completion text when the first choice finished with reason
        ``'stop'``; ``None`` when ``text`` is empty or not a string, or when
        the completion is missing or did not finish normally.

    Raises:
        KeyError: If the ``AZURE_OPENAI_SECRET`` or ``OPENAI_ENGINE``
            application setting is not defined.
    """

    # Nothing to summarize: skip the (billed) API call.
    if not isinstance(text, str) or not text.strip():
        return None

    openai.api_type = "azure"
    openai.api_base = OPENAI_ENDPOINT
    openai.api_version = "2022-12-01"
    openai.api_key = os.environ["AZURE_OPENAI_SECRET"]

    response = openai.Completion.create(
        engine=os.environ["OPENAI_ENGINE"],
        prompt=f'{OPENAI_PROMPT} {text}',
        temperature=0.5,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        best_of=1,
        stop=None)

    if response.object != 'text_completion':
        return None

    choices = response['choices']
    # Only accept a completion that ended on its own ('stop').
    if not choices or choices[0]['finish_reason'] != 'stop':
        return None

    return choices[0]['text']
12 | * **Dockerfile** - (Optional) Used when publishing your project in a [custom container](https://aka.ms/azure-functions/python/custom-container). 13 | * **tests/** - (Optional) Contains the test cases of your function app. For more information, see [Unit Testing](https://aka.ms/azure-functions/python/unit-testing). 14 | * **.funcignore** - (Optional) Declares files that shouldn't get published to Azure. Usually, this file contains .vscode/ to ignore your editor setting, .venv/ to ignore local Python virtual environment, tests/ to ignore test cases, and local.settings.json to prevent local app settings being published. 15 | 16 | Each function has its own code file and binding configuration file ([**function.json**](https://aka.ms/azure-functions/python/function.json)). 17 | 18 | #### Developing your first Python function using VS Code 19 | 20 | If you have not already, please checkout our [quickstart](https://aka.ms/azure-functions/python/quickstart) to get you started with Azure Functions developments in Python. 21 | 22 | #### Publishing your function app to Azure 23 | 24 | For more information on deployment options for Azure Functions, please visit this [guide](https://docs.microsoft.com/en-us/azure/azure-functions/create-first-function-vs-code-python#publish-the-project-to-azure). 25 | 26 | #### Next Steps 27 | 28 | * To learn more about developing Azure Functions, please visit [Azure Functions Developer Guide](https://aka.ms/azure-functions/python/developer-guide). 29 | 30 | * To learn more specific guidance on developing Azure Functions with Python, please visit [Azure Functions Developer Python Guide](https://aka.ms/azure-functions/python/python-developer-guide). 
-------------------------------------------------------------------------------- /cognitive_search_skillset/cognitive_search_setup.md: -------------------------------------------------------------------------------- 1 | # Set-up the new skill in the Azure Cognitive Search indexer 2 | 3 | In this example, we will use the REST APIs to deploy the Azure Cognitive Search indexer. 4 | 5 | - [x] **If you have an existing indexer:** You can add the new skills using the steps below. 6 | - [ ] **If you don't have an indexer yet:** You can use the [Postman collection](../cognitive_search_skillset/Cognitive%20Search-OpenAI%20Integration.postman_collection.json) in this repository as a deployment template, after updating the parameters in the [Postman environment](../cognitive_search_skillset/Cognitive%20Search-OpenAI%20Integration.postman_environment.json) file. 7 | 8 | ## Skillset 9 | 10 | In the [Cognitive Search skillset](https://learn.microsoft.com/en-us/rest/api/searchservice/create-skillset#request-body), add a skill as in the example below. 11 | 12 | - If your text is longer than the number of tokens supported by OpenAI, you can use `"/document/pages/*"` as a context and source to apply the OpenAI prompt at the page level. 13 | 14 | - Replace `uri` with your Azure Functions endpoint, in the format *https://YOUR_WEB_APP_NAME.azurewebsites.net/api/FUNCTION_NAME*. 15 | 16 | - Replace `"x-functions-key"` with your Azure Functions secret. You can find it under the *App Keys* section in the Azure portal. 
17 | 18 | ``` json 19 | { 20 | "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill", 21 | "name": "summary-custom-skill", 22 | "description": "Short summary OpenAI generated", 23 | "context": "/document/merged_text", 24 | "uri": "[YOUR_AZURE_FUNCTIONS_ENDPOINT]", 25 | "httpMethod": "POST", 26 | "timeout": "PT30S", 27 | "batchSize": 1, 28 | "degreeOfParallelism": 1, 29 | "inputs": [ 30 | { 31 | "name": "text", 32 | "source": "/document/merged_text" 33 | } 34 | ], 35 | "outputs": [ 36 | { 37 | "name": "summary", 38 | "targetName": "summary" 39 | } 40 | ], 41 | "httpHeaders": { 42 | "x-functions-key": "[YOUR_AZURE_FUNCTIONS_KEY]" 43 | } 44 | } 45 | ``` 46 | 47 | ## Index 48 | 49 | Add a new field in the [index](https://learn.microsoft.com/en-us/rest/api/searchservice/create-index#request-body) for the custom skill. 50 | 51 | ``` json 52 | { 53 | "name": "summary", 54 | "type": "Edm.String", 55 | "searchable": true, 56 | "sortable": false, 57 | "filterable": false, 58 | "facetable": false 59 | } 60 | ``` 61 | 62 | ## Indexer 63 | 64 | Add a new output field mapping in the [indexer](https://learn.microsoft.com/en-us/rest/api/searchservice/create-indexer#request-body). 65 | 66 | ``` json 67 | { 68 | "sourceFieldName": "/document/merged_text/summary", 69 | "targetFieldName": "summary" 70 | } 71 | ``` 72 | 73 | -------------------------------------------------------------------------------- /openai-custom-skill/custom_skillset_setup.md: -------------------------------------------------------------------------------- 1 | # Set-up the OpenAI custom skillset 2 | 3 | 1. In the Azure Portal, get your Azure OpenAI key under the *Keys and Endpoint* section of the resource. Save it for later use. 4 | 5 | 2. In the [OpenAI studio](https://oai.azure.com/), access the playground, go to *Deployments* and create a new deployment with the model of your choice. Save the deployment name for later use. 6 | 7 | 3. 
Open the [openai-custom-skill](../openai-custom-skill) folder in Visual Studio Code. 8 | 9 | 4. Run the `func init` command in the terminal. You will get a new configuration file, *local.settings.json*. You just need to run this command once, to get all the necessary Functions settings in your folder. 10 | 11 | 5. Update *local.settings.json* with the following parameters: 12 | 13 | ```json 14 | { 15 | "IsEncrypted": false, 16 | 17 | "Values": { 18 | 19 | "FUNCTIONS_WORKER_RUNTIME": "python", 20 | 21 | "AzureWebJobsStorage": "UseDevelopmentStorage=true", 22 | 23 | "AZURE_OPENAI_SECRET": "[YOUR_AZURE_OPEN_AI_KEY]", 24 | 25 | "OPENAI_ENGINE": "[YOUR_OPEN-AI_MODEL]" 26 | 27 | } 28 | } 29 | ``` 30 | 31 | The `OPENAI_ENGINE` parameter is your model deployment name in Azure OpenAI. 32 | 33 | 6. Update the [\_\_init\_\_.py](../openai-custom-skill/summarize/__init__.py) function file by updating the global variables `OPENAI_ENDPOINT` and `OPENAI_PROMPT` with your Azure OpenAI endpoint and the prompt you would like to apply to the text of your documents. 34 | 35 | 7. Use the `func start` command in the command line to test locally. A local endpoint should appear at the end of the output. 36 | 37 | 8. Use this endpoint to make a `POST` REST call with the following body format, which is the format of an [array of records](https://learn.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface#format-web-api-inputs) from Cognitive Search: 38 | 39 | ```json 40 | { 41 | "values": [ 42 | { 43 | "recordId": "0", 44 | "data": { 45 | "id": "1", 46 | "lang": "en", 47 | "text": "[SOME_TEST_TEXT]" 48 | } 49 | } 50 | ] 51 | } 52 | ``` 53 | 54 | 9. Deploy the function to Azure using `func azure functionapp publish <YOUR_FUNCTION_APP_NAME>`. Don't forget to copy the environment variables from *local.settings.json* to your application settings. 55 | 10. Save your function endpoint to use it in the Cognitive Search script. 
56 | 57 | 58 | 59 | You can create as many Functions as Custom Skills you want. Just copy the current function folder or create a new folder under `openai-custom-skill`, named after the function you would like to create, that contains a `__init__.py` and a `function.json` file. 60 | 61 | If you run into any issues, refer to the [Getting started with Azure Functions](../openai-custom-skill/getting_started_with_azure_functions.md) documentation. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenAI integration with Azure Cognitive-Search for document analysis 2 | *Azure OpenAI integration as a custom skillset in Azure Cognitive Search* 3 | 4 | ![image-20230227094029410](./img/custom-skill-archi.png) 5 | 6 | ## Why this integration? 7 | 8 | OpenAI has revolutionized the way we develop applications by providing state-of-the-art machine learning models and making it easy for developers to add AI capabilities to their applications without needing to have an extensive background in data science. 9 | 10 | In addition, many of the OpenAI models are available on Azure, where you can get the security capabilities of Microsoft Azure while running the same models as OpenAI. Azure OpenAI offers private networking, regional availability, and responsible AI content filtering. 11 | 12 | However, even if OpenAI APIs are very easy to use and integrate, you may have faced some of these limitations: 13 | 14 | - **Format**: OpenAI only supports text or JSON format. If you want to analyze enterprise documents such as PDF, Word, PowerPoint, etc., you need to extract or transform your data. 15 | - **Source**: You cannot directly connect OpenAI to data stores like a database, a SharePoint or a Data Lake. 16 | - **Token limitation**: Depending on the model used, OpenAI requests can use up to 4097 tokens shared between prompt and completion. 
To analyze longer documents, the text needs to be split into multiple pieces. 17 | 18 | ## What is the added value of using Azure Cognitive Search with OpenAI? 19 | 20 | This is where Azure Cognitive Search comes in as a great complement to Azure OpenAI. 21 | 22 | - **Data Integration**: Azure Cognitive Search has connectors to many Data Sources to simplify data ingestion into a search index. 23 | - **Data transformation:** Transforms large undifferentiated file formats into searchable text. Using the Optical Character Recognition skill, it can even process images. 24 | - **Split text:** The Text Split skill breaks text into chunks of text. You can specify whether you want to break the text into sentences or into pages of a particular length. This skill is useful for the maximum text length requirements in OpenAI. 25 | - **Translation capabilities:** The Text Translation skill evaluates text and returns the text translated to the specified target language. Microsoft Translation API supports more than 70 languages for text translation, while OpenAI has only limited support for a few other languages than English. 26 | 27 | Azure OpenAI can offer additional AI enrichment to your Cognitive Search index such as: 28 | 29 | - Document classification 30 | - Document summarization 31 | - New insights generation 32 | - KPI extraction 33 | - Etc. 34 | 35 | In this example, we will add summarization capability to the Cognitive Search index using Azure OpenAI. 
36 | 37 | ## Requirements 38 | 39 | To deploy this project you'll need these Azure resources: 40 | 41 | - [Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/) : S1 tier is recommended 42 | - [Azure OpenAI](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/overview): With the text model of your choice 43 | 44 | - Python [Azure Functions](https://learn.microsoft.com/en-us/azure/azure-functions/functions-overview): For this project, I used Python 3.9 45 | 46 | For the development, it is recommended to use [Visual Studio Code](https://code.visualstudio.com/) and [Postman](https://www.postman.com/). 47 | 48 | For Visual Studio Code, you can install the [Azure Function extension](https://marketplace.visualstudio.com/items?itemName=ms-azuretools.vscode-azurefunctions) and the [Azure Function Core Tool](https://learn.microsoft.com/en-us/azure/azure-functions/functions-run-local?tabs=v4%2Cwindows%2Ccsharp%2Cportal%2Cbash#v2). 49 | 50 | ## Get started 51 | 52 | - Deploy your OpenAI custom skill using [Azure Functions](./openai-custom-skill/custom_skillset_setup.md). 53 | - Integrate it in your [Cognitive Search Indexer](./cognitive_search_skillset/cognitive_search_setup.md). 54 | - Query your new index. 
55 | -------------------------------------------------------------------------------- /cognitive_search_skillset/Cognitive Search-OpenAI Integration.postman_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "_postman_id": "87ef3532-20b4-4575-a34e-4de62fe6213c", 4 | "name": "Cognitive Search/OpenAI Integration", 5 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json", 6 | "_exporter_id": "5336350" 7 | }, 8 | "item": [ 9 | { 10 | "name": "01 - Create a datasource", 11 | "request": { 12 | "method": "PUT", 13 | "header": [ 14 | { 15 | "key": "api-key", 16 | "value": "{{env_search_api_key}}" 17 | }, 18 | { 19 | "key": "Content-Type", 20 | "value": "application/json" 21 | } 22 | ], 23 | "body": { 24 | "mode": "raw", 25 | "raw": "{ \r\n \"type\" : \"azureblob\",\r\n \"credentials\" : { \r\n \t\"connectionString\": \"{{env_storage_connection_string}}\"\r\n }, \r\n\t\"container\" : { \r\n\t\t\"name\" : \"{{env_storage_container}}\"\r\n\t}\r\n}" 26 | }, 27 | "url": { 28 | "raw": "https://{{search_service}}.search.windows.net/datasources/{{index_name}}-datasource?api-version={{api_version}}", 29 | "protocol": "https", 30 | "host": [ 31 | "{{search_service}}", 32 | "search", 33 | "windows", 34 | "net" 35 | ], 36 | "path": [ 37 | "datasources", 38 | "{{index_name}}-datasource" 39 | ], 40 | "query": [ 41 | { 42 | "key": "api-version", 43 | "value": "{{api_version}}" 44 | } 45 | ] 46 | } 47 | }, 48 | "response": [] 49 | }, 50 | { 51 | "name": "02 - Create Index", 52 | "request": { 53 | "method": "PUT", 54 | "header": [ 55 | { 56 | "key": "api-key", 57 | "value": "{{env_search_api_key}}" 58 | }, 59 | { 60 | "key": "Content-Type", 61 | "value": "application/json" 62 | } 63 | ], 64 | "body": { 65 | "mode": "raw", 66 | "raw": "{\r\n \"fields\": [\r\n {\r\n \"name\": \"content\",\r\n \"type\": \"Edm.String\",\r\n \"sortable\": false,\r\n \"searchable\": true,\r\n \"filterable\": 
false,\r\n \"facetable\": false\r\n },\r\n {\r\n \"name\": \"keyPhrases\",\r\n \"type\": \"Collection(Edm.String)\",\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": true,\r\n \"facetable\": true\r\n },\r\n {\r\n \"name\": \"organizations\",\r\n \"type\": \"Collection(Edm.String)\",\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": true,\r\n \"facetable\": true\r\n },\r\n {\r\n \"name\": \"persons\",\r\n \"type\": \"Collection(Edm.String)\",\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": true,\r\n \"facetable\": true\r\n },\r\n {\r\n \"name\": \"locations\",\r\n \"type\": \"Collection(Edm.String)\",\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": true,\r\n \"facetable\": true\r\n },\r\n {\r\n \"name\": \"language\",\r\n \"type\": \"Edm.String\",\r\n \"facetable\": false,\r\n \"filterable\": true,\r\n \"key\": false,\r\n \"retrievable\": true,\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"analyzer\": \"standard.lucene\",\r\n \"indexAnalyzer\": null,\r\n \"searchAnalyzer\": null,\r\n \"synonymMaps\": [],\r\n \"fields\": []\r\n },\r\n {\r\n \"name\": \"translated_text\",\r\n \"type\": \"Edm.String\",\r\n \"facetable\": false,\r\n \"filterable\": false,\r\n \"key\": false,\r\n \"retrievable\": true,\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"analyzer\": \"en.lucene\",\r\n \"indexAnalyzer\": null,\r\n \"searchAnalyzer\": null,\r\n \"synonymMaps\": [],\r\n \"fields\": []\r\n },\r\n {\r\n \"name\": \"metadata_storage_path\",\r\n \"type\": \"Edm.String\",\r\n \"key\": true,\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": false,\r\n \"facetable\": false\r\n },\r\n {\r\n \"name\": \"metadata_storage_name\",\r\n \"type\": \"Edm.String\",\r\n \"searchable\": true,\r\n \"sortable\": false,\r\n \"filterable\": false,\r\n \"facetable\": false\r\n },\r\n {\r\n \"name\": \"summary\",\r\n \"type\": \"Edm.String\",\r\n \"searchable\": true,\r\n 
\"sortable\": false,\r\n \"filterable\": false,\r\n \"facetable\": false\r\n }\r\n ]\r\n}" 67 | }, 68 | "url": { 69 | "raw": "https://{{search_service}}.search.windows.net/indexes/{{index_name}}?api-version={{api_version}}", 70 | "protocol": "https", 71 | "host": [ 72 | "{{search_service}}", 73 | "search", 74 | "windows", 75 | "net" 76 | ], 77 | "path": [ 78 | "indexes", 79 | "{{index_name}}" 80 | ], 81 | "query": [ 82 | { 83 | "key": "api-version", 84 | "value": "{{api_version}}" 85 | } 86 | ] 87 | } 88 | }, 89 | "response": [] 90 | }, 91 | { 92 | "name": "03 - Create a skillset", 93 | "request": { 94 | "method": "PUT", 95 | "header": [ 96 | { 97 | "key": "api-key", 98 | "value": "{{env_search_api_key}}" 99 | }, 100 | { 101 | "key": "Content-Type", 102 | "value": "application/json" 103 | } 104 | ], 105 | "body": { 106 | "mode": "raw", 107 | "raw": "{\r\n \"description\": \"Extract entities, detect language and extract key-phrases\",\r\n \"cognitiveServices\": {\r\n \t\"@odata.type\": \"#Microsoft.Azure.Search.CognitiveServicesByKey\",\r\n \t\"description\": \"mycogsvcs\",\r\n \t\"key\": \"{{cog_services_key}}\"\r\n },\r\n \"skills\":\r\n [\r\n \t{\r\n \"@odata.type\": \"#Microsoft.Skills.Vision.OcrSkill\",\r\n \"context\": \"/document/normalized_images/*\",\r\n \"defaultLanguageCode\": \"en\",\r\n \"detectOrientation\": true,\r\n \"inputs\": [\r\n {\r\n \"name\": \"image\",\r\n \"source\": \"/document/normalized_images/*\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"text\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.MergeSkill\",\r\n \"description\": \"Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.\",\r\n \"context\": \"/document\",\r\n \"insertPreTag\": \" \",\r\n \"insertPostTag\": \" \",\r\n \"inputs\": [\r\n {\r\n \"name\":\"text\", \r\n \"source\": \"/document/content\"\r\n },\r\n {\r\n \"name\": \"itemsToInsert\", \r\n \"source\": 
\"/document/normalized_images/*/text\"\r\n },\r\n {\r\n \"name\":\"offsets\", \r\n \"source\": \"/document/normalized_images/*/contentOffset\" \r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"mergedText\", \r\n \"targetName\" : \"merged_text\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.SplitSkill\",\r\n \"textSplitMode\": \"pages\",\r\n \"maximumPageLength\": 4000,\r\n \"defaultLanguageCode\": \"en\",\r\n \"context\": \"/document\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": \"/document/merged_text\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"textItems\",\r\n \"targetName\": \"pages\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\r\n \"context\": \"/document/pages/*\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": \"/document/pages/*\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"keyPhrases\",\r\n \"targetName\": \"keyPhrases\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.LanguageDetectionSkill\",\r\n \"context\": \"/document\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": \"/document/merged_text\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"languageCode\",\r\n \"targetName\": \"language\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.TranslationSkill\",\r\n \"context\": \"/document/merged_text\",\r\n \"defaultToLanguageCode\": \"en\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": \"/document/merged_text\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"translatedText\",\r\n \"targetName\": \"translated_text\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Text.V3.EntityRecognitionSkill\",\r\n \"context\": \"/document/pages/*\",\r\n \"categories\": [\"Organization\", \"Location\", \"Person\"],\r\n \"minimumPrecision\": 0.7,\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": 
\"/document/pages/*\"\r\n },\r\n {\r\n \"name\": \"languageCode\", \r\n \"source\": \"/document/language\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"organizations\",\r\n \"targetName\": \"organizations\"\r\n },\r\n {\r\n \"name\": \"locations\",\r\n \"targetName\": \"locations\"\r\n },\r\n {\r\n \"name\": \"persons\",\r\n \"targetName\": \"persons\"\r\n }\r\n ]\r\n },\r\n {\r\n \"@odata.type\": \"#Microsoft.Skills.Custom.WebApiSkill\",\r\n \"name\": \"summary-custom-skill\",\r\n \"description\": \"Short summary OpenAI generated\",\r\n \"context\": \"/document/merged_text\",\r\n \"uri\": \"{{azure_functions_endpoint}}\",\r\n \"httpMethod\": \"POST\",\r\n \"timeout\": \"PT30S\",\r\n \"batchSize\": 1,\r\n \"degreeOfParallelism\": 1,\r\n \"inputs\": [\r\n {\r\n \"name\": \"text\",\r\n \"source\": \"/document/merged_text\"\r\n }\r\n ],\r\n \"outputs\": [\r\n {\r\n \"name\": \"summary\",\r\n \"targetName\": \"summary\"\r\n }\r\n ],\r\n \"httpHeaders\": {\r\n \"x-functions-key\": \"{{function_key}}\"\r\n }\r\n }\r\n ],\r\n\"knowledgeStore\": {\r\n \"storageConnectionString\": \"{{env_storage_connection_string}}\",\r\n \"projections\": [\r\n {\r\n \"tables\": [\r\n {\r\n \"tableName\": \"WFPSearchSkillsetDocument\",\r\n \"generatedKeyName\": \"Documentid\",\r\n \"source\": \"/document/metadata_storage_path\"\r\n },\r\n {\r\n \"tableName\": \"WFPSearchSkillsetPages\",\r\n \"generatedKeyName\": \"Pagesid\",\r\n \"source\": \"/document/pages/*\",\r\n \"sourceContext\": null\r\n },\r\n {\r\n \"tableName\": \"WFPSearchSkillsetKeyPhrases\",\r\n \"referenceKeyName\": null,\r\n \"generatedKeyName\": \"KeyPhrasesid\",\r\n \"source\": null,\r\n \"sourceContext\": \"/document/pages/*/keyPhrases/*\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"keyphrases\",\r\n \"source\": \"/document/pages/*/keyPhrases/*\",\r\n \"sourceContext\": null,\r\n \"inputs\": []\r\n }\r\n ]\r\n },\r\n {\r\n \"tableName\": \"WFPSearchSkillsetLocation\",\r\n \"referenceKeyName\": null,\r\n 
\"generatedKeyName\": \"Locationsid\",\r\n \"source\": null,\r\n \"sourceContext\": \"/document/pages/*/locations/*\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"locations\",\r\n \"source\": \"/document/pages/*/locations/*\",\r\n \"sourceContext\": null,\r\n \"inputs\": []\r\n }\r\n ]\r\n },\r\n {\r\n \"tableName\": \"WFPSearchSkillsetOrg\",\r\n \"referenceKeyName\": null,\r\n \"generatedKeyName\": \"Orgid\",\r\n \"source\": null,\r\n \"sourceContext\": \"/document/pages/*/organizations/*\",\r\n \"inputs\": [ {\r\n \"name\": \"organizations\",\r\n \"source\": \"/document/pages/*/organizations/*\",\r\n \"sourceContext\": null,\r\n \"inputs\": []\r\n }]\r\n },\r\n {\r\n \"tableName\": \"WFPSearchSkillsetPerson\",\r\n \"referenceKeyName\": null,\r\n \"generatedKeyName\": \"Personsid\",\r\n \"source\": null,\r\n \"sourceContext\": \"/document/pages/*/persons/*\",\r\n \"inputs\": [\r\n {\r\n \"name\": \"persons\",\r\n \"source\": \"/document/pages/*/persons/*\",\r\n \"sourceContext\": null,\r\n \"inputs\": []\r\n }\r\n ]\r\n },\r\n {\r\n \"tableName\": \"azureblobSkillsetImages\",\r\n \"referenceKeyName\": null,\r\n \"generatedKeyName\": \"Imagesid\",\r\n \"source\": \"/document/normalized_images/*\",\r\n \"sourceContext\": null,\r\n \"inputs\": []\r\n }\r\n ],\r\n \"objects\": [],\r\n \"files\": []\r\n }\r\n ],\r\n \"parameters\": {\r\n \"synthesizeGeneratedKeyName\": true\r\n }\r\n }\r\n}" 108 | }, 109 | "url": { 110 | "raw": "https://{{search_service}}.search.windows.net/skillsets/{{index_name}}-skillset?api-version={{api_version}}", 111 | "protocol": "https", 112 | "host": [ 113 | "{{search_service}}", 114 | "search", 115 | "windows", 116 | "net" 117 | ], 118 | "path": [ 119 | "skillsets", 120 | "{{index_name}}-skillset" 121 | ], 122 | "query": [ 123 | { 124 | "key": "api-version", 125 | "value": "{{api_version}}" 126 | } 127 | ] 128 | } 129 | }, 130 | "response": [] 131 | }, 132 | { 133 | "name": "04 - Create Indexer", 134 | "request": { 135 | "method": "PUT", 136 | 
"header": [ 137 | { 138 | "key": "api-key", 139 | "value": "{{env_search_api_key}}" 140 | }, 141 | { 142 | "key": "Content-Type", 143 | "value": "application/json" 144 | } 145 | ], 146 | "body": { 147 | "mode": "raw", 148 | "raw": "{\r\n \"dataSourceName\" : \"{{index_name}}-datasource\",\r\n \"targetIndexName\" : \"{{index_name}}\",\r\n \"skillsetName\" : \"{{index_name}}-skillset\",\r\n \"fieldMappings\" : [\r\n {\r\n \"sourceFieldName\" : \"metadata_storage_path\",\r\n \"targetFieldName\" : \"metadata_storage_path\",\r\n \"mappingFunction\" : { \"name\" : \"base64Encode\" }\r\n },\r\n {\r\n \t\"sourceFieldName\": \"metadata_storage_name\",\r\n \t\"targetFieldName\": \"metadata_storage_name\"\r\n }\r\n ],\r\n \"outputFieldMappings\" : \r\n\t[\r\n\t\t{\r\n \t\"sourceFieldName\": \"/document/merged_text\",\r\n \t\"targetFieldName\": \"content\"\r\n },\r\n \t\t{\r\n \"sourceFieldName\" : \"/document/pages/*/organizations/*\", \r\n \"targetFieldName\" : \"organizations\"\r\n },\r\n \t\t{\r\n \"sourceFieldName\" : \"/document/pages/*/persons/*\", \r\n \"targetFieldName\" : \"persons\"\r\n },\r\n \t\t{\r\n \"sourceFieldName\" : \"/document/pages/*/locations/*\", \r\n \"targetFieldName\" : \"locations\"\r\n },\r\n {\r\n \"sourceFieldName\" : \"/document/pages/*/keyPhrases/*\", \r\n \"targetFieldName\" : \"keyPhrases\"\r\n },\r\n {\r\n\t \"sourceFieldName\": \"/document/language\",\r\n\t \"targetFieldName\": \"language\"\r\n\t },\r\n\t {\r\n\t \"sourceFieldName\": \"/document/merged_text/translated_text\",\r\n\t \"targetFieldName\": \"translated_text\"\r\n\t },\r\n {\r\n\t \"sourceFieldName\": \"/document/merged_text\",\r\n\t \"targetFieldName\": \"summary\"\r\n\t }\r\n ],\r\n \"parameters\":\r\n {\r\n\t\"batchSize\": 1,\r\n \t\"maxFailedItems\":-1,\r\n \t\"maxFailedItemsPerBatch\":-1,\r\n \t\"configuration\": \r\n\t{\r\n \t\"dataToExtract\": \"contentAndMetadata\",\r\n \t\"imageAction\": \"generateNormalizedImages\"\r\n\t}\r\n }\r\n}" 149 | }, 150 | "url": { 151 | 
"raw": "https://{{search_service}}.search.windows.net/indexers/{{index_name}}-indexer?api-version={{api_version}}", 152 | "protocol": "https", 153 | "host": [ 154 | "{{search_service}}", 155 | "search", 156 | "windows", 157 | "net" 158 | ], 159 | "path": [ 160 | "indexers", 161 | "{{index_name}}-indexer" 162 | ], 163 | "query": [ 164 | { 165 | "key": "api-version", 166 | "value": "{{api_version}}" 167 | } 168 | ] 169 | } 170 | }, 171 | "response": [] 172 | } 173 | ] 174 | } --------------------------------------------------------------------------------